# Spam Detection - Feature Engineering

- Add the project's root directory (two levels up) to the Python path so that the project modules can be imported even if they aren't in the current working directory:

In [None]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# register it on sys.path (only once) so project packages can be imported.
two_levels_up = os.path.join('..', '..')
project_root = os.path.abspath(two_levels_up)
already_registered = project_root in sys.path
if not already_registered:
    sys.path.append(project_root)

- Import the required libraries and modules, as well as our utility functions:

In [None]:
import pandas as pd
import numpy as np
import nltk

# Fetch the NLTK resources this notebook relies on.
# 'punkt_tab' holds the tokenizer data that recent NLTK releases load for
# word_tokenize / sent_tokenize (it superseded the pickled 'punkt' models);
# 'punkt' is still downloaded so older NLTK versions keep working.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize, sent_tokenize
from src.utils import load_config, get_project_root, save_as_csv

- Load the config using the utility function. Get paths to relevant folders/files needed to save and retrieve files:

In [None]:
# Read the project configuration through the shared utility.
config = load_config()

# Relative locations of the processed train/test data for task 1.
processed_locations = config['data']['task1']['processed']
train_path = processed_locations['train']
test_path = processed_locations['test']

# Build absolute file paths; '/' in the configured paths is swapped for the
# separator of the current operating system.
processed_train_data = os.path.join(
    get_project_root(),
    train_path.replace('/', os.sep),
    "spam_detection_train_processed.csv",
)
processed_test_data = os.path.join(
    get_project_root(),
    test_path.replace('/', os.sep),
    "spam_detection_test_processed.csv",
)

# Load both processed datasets into dataframes.
train_df = pd.read_csv(processed_train_data)
test_df = pd.read_csv(processed_test_data)

- A function to count how many special characters are used - a useful feature as spam messages tend to overuse special chars:

In [None]:
def special_char_count(text):
    """Return how many characters of ``text`` are special characters.

    A character counts as special when it appears in the fixed symbol list
    defined below; letters, digits, whitespace, quotes and underscores are
    not counted.
    """
    special_chars = "!@#$%^&*()[]{};:,.<>?/|\\`~-=+"
    total = 0
    for ch in text:
        if ch in special_chars:
            total += 1
    return total

- Spam messages often use a lot of exclamation marks. This is useful when training our model, so we extract the density of exclamation marks, i.e. the number of exclamation marks divided by the total length of the text:

In [None]:
def exclamation_density(text):
    """Return the share of characters in ``text`` that are '!'.

    The divisor is clamped to at least 1, so an empty text gives 0.0
    instead of raising ZeroDivisionError.
    """
    bang_total = text.count('!')
    divisor = len(text) or 1
    return bang_total / divisor

- A function that computes the fraction of whitespace-separated words that are written entirely in uppercase. This may be largely redundant because the text has been lowercased, but uppercase tokens such as "NUM" are still in use.

In [None]:
def uppercase_ratio(text):
    """Return the fraction of whitespace-separated words that are uppercase.

    A word counts when ``str.isupper()`` is true for it. A text without any
    words yields 0.
    """
    tokens = text.split()
    if not tokens:
        return 0
    shouted = [token for token in tokens if token.isupper()]
    return len(shouted) / len(tokens)

- As discovered during data analysis, the average sentence length of spam messages is longer than that of non-spam messages - this function extracts that feature by computing the average number of words per sentence, using "sent_tokenize" to split the text into sentences and "word_tokenize" to count the words in each sentence:

In [None]:
def avg_sentence_length(text):
    """Return the mean number of tokens per sentence in ``text``.

    Sentences come from ``sent_tokenize`` and each sentence is split with
    ``word_tokenize``. When no sentence is found the result is 0.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return 0
    token_total = 0
    for sentence in sentences:
        token_total += len(word_tokenize(sentence))
    return token_total / len(sentences)

- Density of punctuation in our text:

In [None]:
def punctuation_density(text):
    """Return the share of characters in ``text`` that are punctuation.

    Only the marks ``. , ! ? ; :`` are counted. The divisor is clamped to
    at least 1, so an empty text gives 0.0.
    """
    punctuation = '.,!?;:'
    hits = 0
    for ch in text:
        if ch in punctuation:
            hits += 1
    return hits / (len(text) or 1)

In [None]:
def vocabulary_richness(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Tokens are produced by ``word_tokenize``. A text with no tokens
    yields 0.
    """
    tokens = word_tokenize(text)
    token_total = len(tokens)
    return len(set(tokens)) / token_total if token_total else 0

- Spam messages often use a lot of marketing keywords - by using a list of some common ones, we can potentially detect spam content in a message. I will put them in base form to match the preprocessed data:

In [None]:
# Shared NLTK helpers used to reduce words to their base form:
# a Porter stemmer and a WordNet lemmatiser.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

In [None]:
def lemmatise_then_stem(word):
    """Lemmatise ``word`` and return the stem of the resulting lemma."""
    return stemmer.stem(lemmatiser.lemmatize(word))

In [None]:
from functools import lru_cache

# Common marketing terms, written in plain form; they are reduced to base
# form (lemmatised, then stemmed) before being compared with the text.
MARKETING_KEYWORDS = (
    "free", "offer", "buy", "click", "win", "limit", "cash", "deal",
    "discount", "invest", "bonus", "lose", "money", "credit", "service",
    "price", "product",
)


@lru_cache(maxsize=1)
def _marketing_base_keywords():
    """Return the base forms of MARKETING_KEYWORDS, computed only once."""
    return frozenset(lemmatise_then_stem(word) for word in MARKETING_KEYWORDS)


def marketing_keyword_count(text):
    """Count the tokens of ``text`` that match a marketing keyword.

    The text is lowercased and split with ``word_tokenize``; every token
    that equals the base form of one of the marketing keywords is counted,
    so repeated occurrences count each time.

    The keyword base forms never change between calls, so they are built
    once and cached instead of being re-lemmatised and re-stemmed for every
    row this function is applied to.
    """
    base_keywords = _marketing_base_keywords()
    tokens = word_tokenize(text.lower())
    return sum(1 for word in tokens if word in base_keywords)

- Apply features to both training and test dataframes:

In [None]:
# Add every engineered feature column to both the training and test frames.
for df in [train_df, test_df]:
    # pandas reads empty CSV fields back as NaN (a float), which would make
    # len() and the tokenising helpers raise TypeError. Work on a string
    # view with missing values replaced by '' so such rows get zero-valued
    # features; the stored 'clean_text' column itself is left untouched.
    clean_text = df['clean_text'].fillna('').astype(str)

    df['text_length'] = clean_text.apply(len)
    df['word_count'] = clean_text.apply(lambda x: len(word_tokenize(x)))
    df['special_char_count'] = clean_text.apply(special_char_count)
    df['exclamation_density'] = clean_text.apply(exclamation_density)
    df['uppercase_ratio'] = clean_text.apply(uppercase_ratio)
    df['avg_sentence_length'] = clean_text.apply(avg_sentence_length)
    df['punctuation_density'] = clean_text.apply(punctuation_density)
    df['vocabulary_richness'] = clean_text.apply(vocabulary_richness)
    df['marketing_keyword_count'] = clean_text.apply(marketing_keyword_count)

- Inspect the dataframes to get an idea of what is going on inside the data:

In [None]:
# Show the first five rows of the training data, including the new features.
train_df.head(n=5)

In [None]:
# Show the first five rows of the test data, including the new features.
test_df.head(n=5)

In [None]:
# Preview the label, the cleaned text and every engineered feature column.
feature_preview_columns = [
    'label',
    'clean_text',
    'text_length',
    'word_count',
    'special_char_count',
    'exclamation_density',
    'uppercase_ratio',
    'avg_sentence_length',
    'punctuation_density',
    'vocabulary_richness',
    'marketing_keyword_count',
]
train_df[feature_preview_columns].head()

- Save the data to the required location with specified file name:

In [None]:
# Write each feature-enriched dataframe next to its processed input file.
for frame, relative_path, file_name in (
    (train_df, train_path, "spam_detection_train_processed_features.csv"),
    (test_df, test_path, "spam_detection_test_processed_features.csv"),
):
    # '/' in the configured path is swapped for the OS-specific separator.
    output_dir = os.path.join(get_project_root(), relative_path.replace('/', os.sep))
    save_as_csv(frame, output_dir, file_name)