### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2025 Semester 1

## Assignment 1: Scam detection with naive Bayes


**Student ID(s):**     `1352062`


This iPython notebook is a template which you will use for your Assignment 1 submission.

**NOTE: YOU SHOULD ADD YOUR RESULTS, GRAPHS, AND FIGURES FROM YOUR OBSERVATIONS IN THIS FILE TO YOUR REPORT (the PDF file).** Results, figures, etc. which appear in this file but are NOT included in your report will not be marked.

**Adding proper comments to your code is MANDATORY.**

## 0. Infrastructure and setup

In [22]:
import numpy as np
import pandas as pd
import math

In [23]:
def return_tokens(processed_text):
    """
    Tokenise one preprocessed message and tally its tokens.

    The text is split on single spaces, so consecutive spaces (or an empty
    string) produce the empty-string token. Returns a dict mapping each token
    to the number of times it occurs in the message.

    Any non-string input (e.g. a missing value that was read in as a float)
    is reported as a single occurrence of the empty-string token.
    """
    # Guard clause: anything that is not text counts as one empty token
    if not isinstance(processed_text, str):
        return {"": 1}

    tally = {}
    for piece in processed_text.split(" "):
        # Start from 0 the first time a token is seen
        tally[piece] = tally.get(piece, 0) + 1

    return tally

In [24]:
def find_vocabulary(data):
    """
    Find the list of every word which occurs in the dataset (every word in textPreprocessed).

    data is a DataFrame with a "textPreprocessed" column; each entry is
    tokenised with return_tokens.
    Returns the distinct tokens as a sorted list.
    """
    unique_tokens = set()

    # Iterate over the column values themselves so the result does not depend
    # on the DataFrame having a default 0..N-1 index
    for text in data["textPreprocessed"]:
        # Updating a set avoids rebuilding an ever-growing list for every row
        unique_tokens.update(return_tokens(text))

    return sorted(unique_tokens)

In [25]:
def find_count_matrix(data, vocabulary):
    """
    Find the count matrix, which is a matrix of size N*V where N is the number of instances in the data
    and V is the number of words in the vocabulary.
    Each cell in the matrix represents the number of times a given word appeared in a given message.

    data is a DataFrame with a "textPreprocessed" column; vocabulary is a list
    of tokens (assumed to be unique, as find_vocabulary produces).
    Returns a float DataFrame with one row per message (0..N-1, in the order
    the messages appear in data) and one column per vocabulary word.
    Tokens which are not in the vocabulary are ignored so that the result is
    always exactly N*V.
    """
    N = len(data)
    V = len(vocabulary)

    # Column position of each vocabulary word, for O(1) lookup
    column_of = {token: position for position, token in enumerate(vocabulary)}

    # Fill a plain numpy array first; writing DataFrame cells one at a time
    # through .loc is far slower and would add a new column for unknown tokens
    counts = np.zeros((N, V))

    # enumerate gives the row position, so a non-default DataFrame index is fine
    for row, text in enumerate(data["textPreprocessed"]):
        for token, count in return_tokens(text).items():
            column = column_of.get(token)
            if column is not None:
                counts[row, column] = count

    return pd.DataFrame(counts, columns=vocabulary)

In [26]:
def find_prior_prob(data):
    """
    Estimate P(class) for each class label as its relative frequency in data.

    Returns a dictionary {label (as int): fraction of rows with that label},
    with the labels in sorted order.
    """
    class_column = data["class"]
    n_rows = len(data)

    # np.unique yields each distinct label once, in sorted order
    return {
        int(value): int((class_column == value).sum()) / n_rows
        for value in np.unique(class_column)
    }

In [27]:
def calc_p_c_i(data, count_matrix, label, word, V=0, alpha=0):
    """
    p(word | label) = (count(c,i) + alpha) / (total(c) + V*alpha)
    count(c,i) is the total count of times word i appears in messages from class c
    total(c) is the total count of words in class c
    Alpha is smoothing factor used for Laplace smoothing, defaulted to 0 (no smoothing)
    V is the number of words in the vocabulary (the length of a count vector)

    data supplies the "class" column and count_matrix the per-message word
    counts; row k of count_matrix must describe row k of data.
    Returns the probability as a Python float.
    """
    # Select the class's rows by position. Using data's index labels with
    # .iloc only works when data has a default 0..N-1 index.
    in_class = (data["class"] == label).to_numpy()
    class_counts = count_matrix.iloc[np.flatnonzero(in_class)]

    word_counts = class_counts[word].sum() + alpha
    label_counts = class_counts.sum(axis=0).sum() + (V * alpha)

    return float(word_counts / label_counts)

In [28]:
def find_conditional_prob(data, vocabulary, count_matrix, alpha=0):
    """
    Return dictionary conditional probabilities of each token in a class

    The result has the form {label (as int): {word: p(word | label)}} with
    p(word | label) = (count(c,i) + alpha) / (total(c) + V*alpha), where
    count(c,i) is how often the word occurs in messages of the class,
    total(c) is the number of word occurrences in the class and V is
    len(vocabulary).

    alpha is the Laplace smoothing factor; the default of 0 applies no
    smoothing, in which case a word never seen in a class gets probability 0.
    Row k of count_matrix must describe row k of data.
    """
    V = len(vocabulary)
    class_of_row = data["class"]
    conditional_probs = {}

    for label in class_of_row.unique():
        # The per-word totals and the class total are the same for every word
        # of the class, so compute them once per class rather than once per word
        rows = np.flatnonzero((class_of_row == label).to_numpy())
        word_totals = count_matrix.iloc[rows].sum(axis=0)
        class_total = word_totals.sum() + (V * alpha)

        conditional_probs[int(label)] = {
            word: float((word_totals.at[word] + alpha) / class_total)
            for word in vocabulary
        }

    return conditional_probs

In [29]:
def find_test_count_vector(test_text, vocabulary):
    """
    Build the count vector of a single test instance.

    Entry i of the returned float array is the number of times vocabulary[i]
    occurs in test_text. Tokens of the message which are not in the vocabulary
    do not appear in the vector.
    """
    occurrences = return_tokens(test_text)
    vector = np.zeros(len(vocabulary))

    for position, word in enumerate(vocabulary):
        hits = occurrences.get(word)
        # Words missing from the message keep their initial count of 0
        if hits is not None:
            vector[position] = hits

    return vector

In [30]:
def find_posterior_prob(data, count_vector, vocabulary, prior_probs, conditional_probs):
    """
    Find posterior probability P(class = label | count = count vector) -> P(class = label) * P(count|c)

    P(count | c) is the multinomial likelihood
        (n! / prod(x_i!)) * prod(p(word_i | c) ^ x_i)
    where x_i is the count of vocabulary word i and n is the total count.

    data is only used to enumerate the class labels (its "class" column).
    count_vector[i] is the count of vocabulary[i] in the message.
    prior_probs maps label -> P(label); conditional_probs maps
    label -> {word: p(word | label)}.
    Returns a dictionary {label (as int): unnormalised posterior}.
    """
    # Only words that occur in the message contribute: 0! = 1 and p ** 0 = 1
    present = [(i, count) for i, count in enumerate(count_vector) if count != 0]

    # The multinomial coefficient does not depend on the class, so compute it
    # once. It is always a whole number, hence the exact integer division.
    total_count = int(sum(count for _, count in present))
    coefficient = float(
        math.factorial(total_count)
        // math.prod(math.factorial(int(count)) for _, count in present)
    )

    posterior_probs = {}
    for label in data["class"].unique():
        label = int(label)
        p_class = prior_probs[label]

        p_count_c = coefficient
        for token_index, count in present:
            p_c_i = conditional_probs[label][vocabulary[token_index]]
            p_count_c = p_count_c * math.pow(p_c_i, count)

        posterior_probs[label] = p_class * p_count_c

    return posterior_probs

In [31]:
def classify_test_instance(data, vocabulary, test_text, prior_probs, conditional_probs):
    """
    Given a string, classify it as 0 - non malicious, or 1 - scam

    data is forwarded to find_posterior_prob, which uses it to enumerate the
    class labels.
    Returns:
      - None if no token of the message is in the vocabulary (nothing to
        base a decision on), or if no class has a posterior above 0
      - 0.5 if every class has exactly the same posterior (undecided)
      - otherwise the label with the highest posterior
    """
    test_count_vector = find_test_count_vector(test_text, vocabulary)

    # Count vector all zeroes so cannot classify
    if np.count_nonzero(test_count_vector) == 0:
        return None

    test_posterior_prob = find_posterior_prob(data, test_count_vector, vocabulary, prior_probs, conditional_probs)

    # Tie: all classes equally likely. Compare the values rather than looking
    # up the keys 0 and 1, so any label encoding works without a KeyError.
    scores = list(test_posterior_prob.values())
    if len(scores) > 1 and all(score == scores[0] for score in scores[1:]):
        return 0.5

    # Find which class has the highest posterior
    best_label = None
    best_prob = 0
    for label, prob_label in test_posterior_prob.items():
        if prob_label > best_prob:
            best_prob = prob_label
            best_label = label

    return best_label

## 1. Supervised model training


In [32]:
# Load the supervised training set; the functions above read its
# "textPreprocessed" and "class" columns.
train = pd.read_csv("sms_supervised_train.csv")

In [33]:
# Sorted list of every distinct token in the training messages.
vocabulary = find_vocabulary(train)

In [34]:
# N x V matrix of token counts: one row per training message, one column per vocabulary word.
count_matrix = find_count_matrix(train, vocabulary)

In [35]:
# P(class) for each class label, taken from the label frequencies in the training set.
prior_probs = find_prior_prob(train)

In [36]:
# P(word | class) for every vocabulary word and class; no smoothing is applied (alpha stays at 0).
conditional_probs = find_conditional_prob(train, vocabulary, count_matrix)

In [37]:
# check if prob within each class label sum = 1
# Sanity check: within each class the conditional probabilities should add up to 1
for label, word_probs in conditional_probs.items():
    class_total = sum(word_probs.values())
    print((label, class_total))
# close enough to 1 - the difference is floating-point rounding

(0, 1.0000000000000027)
(1, 0.9999999999999903)


## 2. Supervised model evaluation

In [38]:
# Load the test set used to evaluate the supervised model.
test = pd.read_csv("sms_test.csv")

In [39]:
# Classify every test message with the trained model.
# The `data` argument is only used to enumerate the class labels, so the
# training frame is passed: the classes the model knows come from training,
# and prediction should not consult the test set's ground-truth labels.
test_instances = test["textPreprocessed"]
predicted_labels = [
    classify_test_instance(train, vocabulary, test_instance, prior_probs, conditional_probs)
    for test_instance in test_instances
]

# Store the predictions alongside the true labels for evaluation
test["predicted class"] = predicted_labels

In [40]:
# Show the test frame, which now includes the "predicted class" column.
display(test)

Unnamed: 0,textOriginal,textPreprocessed,class,predicted class
0,What's up? Do you want me to come online? If y...,? ? up come online free talk sometime �,0,0.0
1,I don't quite know what to do. I still can't g...,? . . up know quite still get hold anyone cud ...,0,0.0
2,"House-Maid is the murderer, coz the man was mu...",".. , house-maid murderer coz man murder 26th j...",0,0.0
3,Ok i thk i got it. Then u wan me 2 come now or...,? . 2 come u get ok thk wan wat,0,0.0
4,Hello from Orange. For 1 month's free access t...,"1 . . reply free free , , , message 's yes tex...",1,1.0
...,...,...,...,...
995,Well i will watch shrek in 3D!!B),! ! ) well b watch 3d,0,0.0
996,"Babe, I'm back ... Come back to me ...","come , back back ... ... babe",0,0.0
997,Got it! It looks scrumptious... daddy wants to...,get look ... ! ! eat night want long daddy,0,0.0
998,Especially since i talk about boston all up in...,". up talk , ! ! change personal lol especially...",0,0.5


In [41]:
# Print every test message that shares no token with the training vocabulary
# (these are the instances the classifier cannot label).
# A set makes each membership test O(1) instead of scanning the vocabulary list.
vocabulary_set = set(vocabulary)

# Iterate over the column values directly rather than looking rows up by index label
for text in test["textPreprocessed"]:
    if vocabulary_set.isdisjoint(return_tokens(text)):
        print(text)

## 3. Extending the model with semi-supervised training

In [42]:
# Load the unlabelled messages for the semi-supervised extension.
unlabelled = pd.read_csv("sms_unlabelled.csv")

## 4. Supervised model evaluation