# Spam Detection - Data Preprocessing

In [None]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# register it on sys.path (once) so the `src` package can be imported below.
project_root = os.path.abspath(os.path.join('..', '..'))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
import pandas as pd
import numpy as np
import nltk
import re

# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer models
# (word_tokenize), the English stopword list, and WordNet (lemmatiser).
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) load the Punkt models from 'punkt_tab'
# rather than the pickled 'punkt' package; without it word_tokenize raises
# LookupError. Both are requested so old and new NLTK versions work.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from src.utils import load_config, print_text, save_as_csv, confirm_checksum, get_project_root

## 1. Data Loading

In [None]:
config = load_config()

# Task 1 raw inputs: project-relative paths and the checksums recorded for them.
raw_train_data_path = config['data']['task1']['raw']['train']
raw_test_data_path = config['data']['task1']['raw']['test']
raw_train_data_checksum = config['data']['task1']['raw']['train_checksum']
raw_test_data_checksum = config['data']['task1']['raw']['test_checksum']

# Task 1 outputs: project-relative locations for the processed data.
processed_train_data_path = config['data']['task1']['processed']['train']
processed_test_data_path = config['data']['task1']['processed']['test']

# Swap the '/' separators used in the config for the platform separator and
# anchor every path at the project root.
raw_test_data = os.path.join(
    get_project_root(), os.sep.join(raw_test_data_path.split('/'))
)
raw_train_data = os.path.join(
    get_project_root(), os.sep.join(raw_train_data_path.split('/'))
)
processed_test_data = os.path.join(
    get_project_root(), os.sep.join(processed_test_data_path.split('/'))
)
processed_train_data = os.path.join(
    get_project_root(), os.sep.join(processed_train_data_path.split('/'))
)

In [None]:
# Verify BOTH raw files against the checksums from the config before reading
# them. Every file is checked (no short-circuit) so the error names all
# offenders, and a mismatch stops the notebook instead of being silently
# ignored. Assumes confirm_checksum returns a truthy value on a match.
checksum_failures = [
    path
    for path, checksum in (
        (raw_test_data, raw_test_data_checksum),
        (raw_train_data, raw_train_data_checksum),
    )
    if not confirm_checksum(path, checksum)
]
if checksum_failures:
    raise ValueError(
        "Checksum verification failed for: " + ", ".join(checksum_failures)
    )
print("Training and Testing Data Loaded Correctly!")

In [None]:
# Read the raw train and test CSV files into DataFrames.
train_df = pd.read_csv(filepath_or_buffer=raw_train_data)
test_df = pd.read_csv(filepath_or_buffer=raw_test_data)

## 2. Tokenisation

In [None]:
# Split each raw message into a list of tokens with NLTK's word tokenizer and
# keep the result in a new 'tokens' column.
train_df['tokens'] = train_df['text'].map(word_tokenize)
test_df['tokens'] = test_df['text'].map(word_tokenize)

In [None]:
train_df.head()

In [None]:
test_df.head()

## 3. Lowercasing

In [None]:
# Lower-case every token so later steps treat e.g. "Free" and "free" alike.
train_df['tokens'] = train_df['tokens'].apply(lambda words: list(map(str.lower, words)))
test_df['tokens'] = test_df['tokens'].apply(lambda words: list(map(str.lower, words)))

## 4. Number Normalisation

In [None]:
def num_convert(tokens):
    """Replace numeric tokens with placeholder tokens.

    Tokens made of digits followed by an ordinal suffix ("1st", "2nd", "3rd",
    "4th", ...) become "Nth"; tokens made only of digits become "NUM"; every
    other token is returned unchanged. Suffix matching is case-sensitive, so
    tokens are expected to be lower-cased already.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the same length and order as ``tokens``.
    """
    # "rd" must be listed too, otherwise "3rd"/"23rd" are never normalised.
    ordinal_suffixes = ("st", "nd", "rd", "th")
    return [
        # token[:-2] is "" for a bare suffix such as "th"; "".isdigit() is
        # False, so ordinary words are never mistaken for ordinals.
        "Nth" if (token.endswith(ordinal_suffixes) and token[:-2].isdigit())
        else "NUM" if token.isdigit()
        else token
        for token in tokens
    ]

In [None]:
# Swap numbers and ordinals in every token list for their placeholders.
train_df['tokens'] = train_df['tokens'].map(num_convert)
test_df['tokens'] = test_df['tokens'].map(num_convert)

## 5. Stopword Removal

In [None]:
# English stopwords from NLTK, held in a set for fast membership tests.
stop_words = {*stopwords.words('english')}

In [None]:
# Drop every token that appears in the English stopword set.
train_df['tokens'] = train_df['tokens'].map(lambda toks: [tok for tok in toks if tok not in stop_words])
test_df['tokens'] = test_df['tokens'].map(lambda toks: [tok for tok in toks if tok not in stop_words])

## 6. Filter and Clean Tokens

In [None]:
def filter_tokens(tokens):
    """Keep alphabetic words, the NUM/Nth placeholders and selected symbols.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of the tokens that are purely alphabetic, are one of the
        placeholders "NUM"/"Nth", or are one of ! ? $ % & @ *, in their
        original order.
    """
    # Tokens that are retained verbatim regardless of str.isalpha().
    keep_verbatim = {"NUM", "Nth", "!", "?", "$", "%", "&", "@", "*"}
    kept = []
    for word in tokens:
        if word.isalpha() or word in keep_verbatim:
            kept.append(word)
    return kept

In [None]:
# Discard tokens that are neither words, placeholders nor allowed symbols.
train_df['tokens'] = train_df['tokens'].map(filter_tokens)
test_df['tokens'] = test_df['tokens'].map(filter_tokens)

## 7. Lemmatisation and Stemming

In [None]:
# Shared NLTK normalisers used by lemmatise_then_stem: a Porter stemmer and a
# WordNet lemmatiser.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

In [None]:
def numerical_or_ordinal(token):
    """Return True if ``token`` is a placeholder or one of the kept symbols.

    The protected tokens are "NUM", "Nth" and the single characters
    ! ? $ % & @ *.
    """
    protected = {"NUM", "Nth", *"!?$%&@*"}
    return token in protected

In [None]:
def lemmatise_then_stem(tokens):
    """Lemmatise and then stem each token, leaving protected tokens as-is.

    Tokens for which ``numerical_or_ordinal`` is true are passed through
    untouched. Every other token is lemmatised with ``lemmatiser`` and the
    lemma is then stemmed with ``stemmer``, unless the lemma itself is a
    protected token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of processed tokens in the original order.
    """
    processed = []
    for token in tokens:
        if numerical_or_ordinal(token):
            processed.append(token)
            continue
        lemma = lemmatiser.lemmatize(token)
        if numerical_or_ordinal(lemma):
            processed.append(lemma)
        else:
            processed.append(stemmer.stem(lemma))
    return processed

In [None]:
# Lemmatise and stem every token list. `lemmatiser` and `stemmer` are the
# instances already created earlier in this section, so they are not
# re-instantiated here.
train_df['tokens'] = train_df['tokens'].apply(lemmatise_then_stem)
test_df['tokens'] = test_df['tokens'].apply(lemmatise_then_stem)

In [None]:
train_df.head()

In [None]:
test_df.head()

## 8. Reconstruction and Sampling

- Join each token list back into a single space-separated string (`clean_text`) so the processed data is cleaner and easier to use

In [None]:
# Re-assemble each token list into one space-separated string.
train_df['clean_text'] = train_df['tokens'].apply(' '.join)
test_df['clean_text'] = test_df['tokens'].apply(' '.join)

- Inspect a few samples

In [None]:
# Draw three random training rows and print their cleaned text with its label.
train_samples = train_df.sample(n=3)

for _, sample_row in train_samples.iterrows():
    print_text(sample_row['clean_text'], sample_row['label'])

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
# The raw text and the token lists are no longer needed once 'clean_text'
# exists, so drop both columns before saving.
train_df = train_df.drop(columns=['text', 'tokens'])
test_df = test_df.drop(columns=['text', 'tokens'])

## 9. Save Data

In [None]:
# Persist the processed train and test frames via the project helper, using
# the processed-data locations resolved from the config.
# NOTE(review): how save_as_csv combines the path argument with the file-name
# argument is not visible here — confirm against src.utils.
save_as_csv(train_df, processed_train_data, "spam_detection_train_processed.csv")
save_as_csv(test_df, processed_test_data, "spam_detection_test_processed.csv")