# Spam Detection - Feature Engineering

In [None]:
import sys
import os

# Make the directory two levels above the working directory importable
# (the notebook later imports from the `src` package).
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
import pandas as pd
import numpy as np
import nltk

# 'punkt' serves older NLTK releases; newer releases load the tokenizer data
# from 'punkt_tab' instead, so fetch both to keep word_tokenize/sent_tokenize
# working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize, sent_tokenize
from src.utils import load_config, get_project_root, save_as_csv

In [None]:
# Look up the processed train/test directories for task 1 in the project config.
config = load_config()
processed_cfg = config['data']['task1']['processed']
train_path = processed_cfg['train']
test_path = processed_cfg['test']

# Build the CSV locations under the project root; any '/' in the configured
# paths is swapped for the OS-specific separator first.
processed_train_data = os.path.join(
    get_project_root(),
    train_path.replace('/', os.sep),
    "spam_detection_train_processed.csv",
)
processed_test_data = os.path.join(
    get_project_root(),
    test_path.replace('/', os.sep),
    "spam_detection_test_processed.csv",
)

# Load both splits into DataFrames.
train_df = pd.read_csv(processed_train_data)
test_df = pd.read_csv(processed_test_data)

## 1. Feature Functions

In [None]:
def special_char_count(text):
    """Count the characters of ``text`` that belong to the special-symbol set.

    Every occurrence is counted separately. Letters, digits, whitespace and
    symbols outside the set below (for example quotes and underscore) are
    ignored.
    """
    special_chars = "!@#$%^&*()[]{};:,.<>?/|\\`~-=+"
    total = 0
    for ch in text:
        if ch in special_chars:
            total += 1
    return total

In [None]:
def exclamation_density(text):
    """Return the share of characters in ``text`` that are exclamation marks.

    The divisor is floored at 1, so an empty text yields 0 instead of
    raising a division error.
    """
    bang_total = text.count('!')
    char_total = max(len(text), 1)
    return bang_total / char_total

In [None]:
def uppercase_ratio(text):
    """Return the fraction of whitespace-separated words that are upper case.

    A word counts when ``str.isupper`` is true for it. A text without any
    words gives 0.
    """
    tokens = text.split()
    if not tokens:
        return 0
    shouted = [tok for tok in tokens if tok.isupper()]
    return len(shouted) / len(tokens)

In [None]:
def avg_sentence_length(text):
    """Return the mean number of tokens per sentence in ``text``.

    Sentences come from ``sent_tokenize`` and each one is split with
    ``word_tokenize``. 0 is returned when no sentence is found.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return 0
    token_total = 0
    for sentence in sentences:
        token_total += len(word_tokenize(sentence))
    return token_total / len(sentences)

In [None]:
def punctuation_density(text):
    """Return the share of characters in ``text`` that are one of ``.,!?;:``.

    The divisor is floored at 1, so an empty text yields 0.
    """
    marks = '.,!?;:'
    hits = len([ch for ch in text if ch in marks])
    return hits / max(len(text), 1)

In [None]:
def vocabulary_richness(text):
    """Return the ratio of distinct tokens to all tokens in ``text``.

    Tokens are produced by ``word_tokenize`` and compared exactly as
    returned. A text with no tokens gives 0.
    """
    tokens = word_tokenize(text)
    return len(set(tokens)) / len(tokens) if tokens else 0

In [None]:
def marketing_keyword_count(text, keywords=None):
    """Count the tokens of ``text`` that are marketing keywords.

    Args:
        text: Message to scan. It is lower-cased and then split with
            ``word_tokenize``.
        keywords: Optional iterable of keyword strings, matched
            case-insensitively. When omitted, a built-in list of eight
            marketing words is used.

    Returns:
        The number of tokens equal to one of the keywords. Repeated
        occurrences each count.
    """
    if keywords is None:
        keywords = ["free", "offer", "buy", "click", "win", "limited", "cash", "deal"]
    # Tokens are lower-cased below, so keywords containing capitals could
    # never match unless they are normalised too. A set also makes each
    # membership test constant time.
    keyword_set = {keyword.lower() for keyword in keywords}
    tokens = word_tokenize(text.lower())
    return sum(1 for word in tokens if word in keyword_set)

## 2. Apply Features

In [None]:
# Map each output column to the function that derives it from one message.
# Insertion order is the order in which the columns are added.
feature_functions = {
    'text_length': len,
    'word_count': lambda x: len(word_tokenize(x)),
    'special_char_count': special_char_count,
    'exclamation_density': exclamation_density,
    'uppercase_ratio': uppercase_ratio,
    'avg_sentence_length': avg_sentence_length,
    'punctuation_density': punctuation_density,
    'vocabulary_richness': vocabulary_richness,
    'marketing_keyword_count': marketing_keyword_count,
}

for df in (train_df, test_df):
    # pandas reads empty CSV cells back as NaN (a float), which would make
    # len() and the string-based feature functions raise. Treat missing text
    # as an empty message; the stored 'clean_text' column is left untouched.
    texts = df['clean_text'].fillna('')
    for column, feature in feature_functions.items():
        df[column] = texts.apply(feature)

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

## 3. Inspect Example Rows

In [None]:
# Show the label and text next to each engineered feature for the first rows.
inspect_columns = [
    'label',
    'clean_text',
    'text_length',
    'word_count',
    'special_char_count',
    'exclamation_density',
    'uppercase_ratio',
    'avg_sentence_length',
    'punctuation_density',
    'vocabulary_richness',
    'marketing_keyword_count',
]
train_df[inspect_columns].head()

## 4. Save Engineered Data

In [None]:
# Write each split, now carrying the feature columns, to its processed
# directory under a "_features" file name (train first, then test).
for frame, rel_dir, file_name in (
    (train_df, train_path, "spam_detection_train_processed_features.csv"),
    (test_df, test_path, "spam_detection_test_processed_features.csv"),
):
    target_dir = os.path.join(get_project_root(), rel_dir.replace('/', os.sep))
    save_as_csv(frame, target_dir, file_name)