In [1]:
import numpy as np
import pandas as pd

import re, nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from tabulate import tabulate

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bower\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Collecting and cleaning data

In [2]:
# Accumulators: the message texts gathered per channel and how many rows each contributed.
sms_data, email_data = [], []
sms_count, email_count = 0, 0

# ---------- SMS corpora ----------

# Source: https://www.kaggle.com/datasets/galactus007/sms-smishing-collection-data-set/
# Only rows whose "label" column equals 1 are kept.
df = pd.read_csv("datasets/sms_messaging.csv")
df = df[df["label"] == 1]

sms_count = sms_count + len(df)
sms_data = np.concatenate([sms_data, np.asarray(df["sms"])])

# Source: https://data.mendeley.com/datasets/f45bkkt8pr/1
# Only rows whose "LABEL" column equals "Smishing" are kept.
df = pd.read_csv("datasets/sms_phishing.csv")
df = df[df["LABEL"] == "Smishing"]

sms_count = sms_count + len(df)
sms_data = np.concatenate([sms_data, np.asarray(df["TEXT"])])

# ---------- Email corpus ----------

# Source: https://www.kaggle.com/datasets/subhajournal/phishingemails
# Only rows whose "Email Type" column equals "Phishing Email" are kept.
df = pd.read_csv("datasets/Phishing_Email.csv")
df = df[df["Email Type"] == "Phishing Email"]

email_count = email_count + len(df)
email_data = np.concatenate([email_data, np.asarray(df["Email Text"])])

In [3]:
# Cache for the NLTK stop-word list so it is read once, not once per message.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise one message: drop punctuation, lowercase, and remove stop words.

    Parameters
    ----------
    text : object
        The raw message. Non-string values are converted with ``str``;
        ``None`` and float NaN (missing values) yield an empty string
        instead of the literal tokens "none" / "nan".
    stop_words : collection of str, optional
        Lowercase words to remove. Defaults to the NLTK English stop-word
        list, which is loaded once and reused across calls.

    Returns
    -------
    str
        The remaining words, lowercased, punctuation-free and joined by
        single spaces.
    """
    # Missing values would otherwise be stringified into junk tokens.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stop_words is None:
        stop_words = _english_stop_words()

    kept = []
    for token in str(text).split():
        # Token with every punctuation character removed, then lowercased.
        word = re.sub(r'[^\w\s]', '', token).lower()
        if not word:
            # The token consisted of punctuation only.
            continue

        # Token with only its surrounding punctuation removed, so inner
        # apostrophes survive and contractions such as "don't" can still be
        # matched against the stop-word list. Typographic apostrophes are
        # normalised to ASCII first.
        bare = re.sub(r'^[^\w]+|[^\w]+$', '', token.replace('\u2019', "'")).lower()

        if word in stop_words or bare in stop_words:
            continue
        kept.append(word)

    return ' '.join(kept)

# Run every collected message through clean_text, keeping the two channels separate.
clean_sms = list(map(clean_text, sms_data))
clean_email = list(map(clean_text, email_data))

## Non-negative matrix factorization

In [4]:
def nmf_analysis(data, n_topics=5, n_top_words=10, random_state=42):
    """Fit an NMF topic model on TF-IDF features of ``data`` and print the top words per topic.

    Parameters
    ----------
    data : iterable of str
        The documents to analyse, one string per document.
    n_topics : int, default 5
        Number of NMF components (topics) to extract.
    n_top_words : int, default 10
        Number of highest-weighted terms listed for each topic.
    random_state : int, default 42
        Seed forwarded to ``NMF``.

    Returns
    -------
    None
        The topic table is printed; nothing is returned.
    """
    # TF-IDF document-term matrix, with scikit-learn's English stop words removed.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(data)

    # Factorise the matrix into `n_topics` components.
    nmf = NMF(n_components=n_topics, random_state=random_state)
    nmf.fit(tfidf_matrix)

    # Maps a column index of the TF-IDF matrix back to its term.
    feature_names = vectorizer.get_feature_names_out()

    topic_keywords = []
    for topic in nmf.components_:
        # Indices of the largest weights, highest first.
        top_indices = topic.argsort()[::-1][:n_top_words]
        topic_keywords.append([feature_names[i] for i in top_indices])

    # Transpose so each topic is a column and each row is a rank position.
    table_data = np.transpose(topic_keywords)
    table_headers = ['Topic ' + str(i) for i in range(len(topic_keywords))]

    print(tabulate(table_data, headers=table_headers, tablefmt="fancy_grid"))

# Print the topic table for the SMS corpus first, then for the email corpus.
for corpus in (clean_sms, clean_email):
    nmf_analysis(corpus)

╒════════════╤═════════════╤═════════════╤═══════════╤════════════╕
│ Topic 0    │ Topic 1     │ Topic 2     │ Topic 3   │ Topic 4    │
╞════════════╪═════════════╪═════════════╪═══════════╪════════════╡
│ prize      │ free        │ secret      │ receive   │ identifier │
├────────────┼─────────────┼─────────────┼───────────┼────────────┤
│ guaranteed │ reply       │ admirer     │ award     │ unredeemed │
├────────────┼─────────────┼─────────────┼───────────┼────────────┤
│ urgent     │ video       │ thinks      │ selected  │ points     │
├────────────┼─────────────┼─────────────┼───────────┼────────────┤
│ claim      │ 08000930705 │ ufind       │ claim     │ expires    │
├────────────┼─────────────┼─────────────┼───────────┼────────────┤
│ valid      │ new         │ rreveal     │ 350       │ statement  │
├────────────┼─────────────┼─────────────┼───────────┼────────────┤
│ 12hrs      │ mins        │ specialcall │ ending    │ account    │
├────────────┼─────────────┼─────────────┼──────