# Spam Detection - Data Preprocessing

- Add the project's root directory (two levels up) to the Python path so that the project's modules can be imported, even if they aren't in the current working directory:

In [None]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# make it importable, without adding a duplicate sys.path entry.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.append(project_root)

- Import the required libraries and modules, as well as our utility functions:

In [None]:
import pandas as pd
import numpy as np
import nltk

# Fetch the NLTK resources this notebook relies on: the tokeniser models used
# by word_tokenize, the English stopword list, and WordNet for the lemmatiser.
nltk.download('punkt')
# Recent NLTK releases load the tokeniser from 'punkt_tab' rather than the
# pickled 'punkt' models; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from src.utils import load_config, print_text, save_as_csv, confirm_checksum, get_project_root

- Load the config using the utility function. Get paths to relevant folders/files needed to save and retrieve files:

In [None]:
# Load the project configuration and pull out the task1 (spam detection) entries.
config = load_config()
task1_raw = config['data']['task1']['raw']
task1_processed = config['data']['task1']['processed']

# Expected checksums of the raw files.
raw_test_data_checksum = task1_raw['test_checksum']
raw_train_data_checksum = task1_raw['train_checksum']

# Paths as written in the config ('/'-separated).
raw_test_data_path = task1_raw['test']
raw_train_data_path = task1_raw['train']
processed_test_data_path = task1_processed['test']
processed_train_data_path = task1_processed['train']


def _under_project_root(config_path):
    """Join a '/'-separated config path onto the project root using the OS separator."""
    return os.path.join(get_project_root(), config_path.replace('/', os.sep))


raw_test_data = _under_project_root(raw_test_data_path)
raw_train_data = _under_project_root(raw_train_data_path)
processed_test_data = _under_project_root(processed_test_data_path)
processed_train_data = _under_project_root(processed_train_data_path)

- Using the provided checksums, check that the test and training data are correctly loaded - ensures consistency with provided files:

In [None]:
# Verify both raw files against the checksums from the config. Each file is
# checked independently so a failure names the offending file(s), and a
# mismatch stops the notebook instead of passing silently.
checksum_results = {
    raw_test_data: confirm_checksum(raw_test_data, raw_test_data_checksum),
    raw_train_data: confirm_checksum(raw_train_data, raw_train_data_checksum),
}
mismatched_files = [path for path, matches in checksum_results.items() if not matches]
if mismatched_files:
    raise ValueError(f"Checksum mismatch for: {', '.join(mismatched_files)}")
print("Training and Testing Data Loaded Correctly!")

- Load the CSV data into dataframes:

In [None]:
# Read the raw training and test CSV files (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (raw_train_data, raw_test_data)
)

- Word tokenise the text in both datasets and give this a new column called tokens:

In [None]:
# Tokenise the 'text' column of each dataset into a new 'tokens' column.
for frame in (train_df, test_df):
    frame['tokens'] = frame['text'].apply(word_tokenize)

- Inspect the dataframes to get an idea of what is going on inside the data:

In [None]:
# Preview the first rows of the training data, now including the 'tokens' column.
train_df.head()

In [None]:
# Preview the first rows of the test data, now including the 'tokens' column.
test_df.head()

- For consistency, we apply lowercasing to the tokens:

In [None]:
def _lowercase_all(tokens):
    """Return a new list with every token lowercased."""
    return [word.lower() for word in tokens]


# Normalise case in both datasets.
for frame in (train_df, test_df):
    frame['tokens'] = frame['tokens'].apply(_lowercase_all)

- As per the lab exercises, we replace ordinals (e.g. `1st`, `2nd`) with `Nth` and purely numeric tokens with `NUM`:

In [None]:
def num_convert(tokens):
    """Replace ordinal and purely numeric tokens with placeholder tokens.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list in which each ordinal such as "1st", "2nd", "3rd" or "4th"
        becomes "Nth", each all-digit token becomes "NUM", and every other
        token is kept unchanged.
    """
    # "rd" must be included so ordinals like "3rd" and "23rd" are recognised.
    ordinal_suffixes = ("st", "nd", "rd", "th")
    return [
        # Require digits before the suffix so words like "bird" are untouched.
        "Nth" if (token.endswith(ordinal_suffixes) and token[:-2].isdigit())
        else "NUM" if token.isdigit()
        else token
        for token in tokens
    ]

- Apply the above conversion function:

In [None]:
# Swap numbers and ordinals for their placeholders in both datasets.
for frame in (train_df, test_df):
    frame['tokens'] = frame['tokens'].apply(num_convert)

- As per the lab exercises, we want to remove any stopwords - they will not be useful in this model:

In [None]:
# English stopword list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
def _without_stopwords(tokens):
    """Return the tokens that are not in the stop_words set."""
    return [word for word in tokens if word not in stop_words]


# Remove stopwords from both datasets.
for frame in (train_df, test_df):
    frame['tokens'] = frame['tokens'].apply(_without_stopwords)

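- As learnt from the data analysis, certain symbols can help us distinguish spam from non-spam, so we keep a small set of them (`! ? $ % & @ *`) along with alphabetic words and the `NUM`/`Nth` placeholders. We filter out anything else: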

In [None]:
def filter_tokens(tokens):
    """Keep alphabetic words, the NUM/Nth placeholders and a fixed set of symbols.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list containing, in their original order, only the tokens that
        are purely alphabetic, equal to "NUM" or "Nth", or one of the symbols
        ! ? $ % & @ *.
    """
    placeholders = {"NUM", "Nth"}
    allowed_symbols = {"!", "?", "$", "%", "&", "@", "*"}

    def _is_kept(word):
        return word.isalpha() or word in placeholders or word in allowed_symbols

    return list(filter(_is_kept, tokens))

In [None]:
# Apply the token filter to both datasets.
for frame in (train_df, test_df):
    frame['tokens'] = frame['tokens'].apply(filter_tokens)

- Use lemmatising and stemming to put words into their base form - makes it more efficient and consistent when training our model:

In [None]:
# Shared lemmatiser and stemmer instances, reused for every token.
lemmatiser, stemmer = WordNetLemmatizer(), PorterStemmer()

In [None]:
def numerical_or_ordinal(token):
    """Report whether a token is a placeholder or an allowed symbol.

    Args:
        token: The token to check.

    Returns:
        True if the token is "NUM", "Nth" or one of ! ? $ % & @ *,
        otherwise False.
    """
    placeholders = frozenset({"NUM", "Nth"})
    symbols = frozenset({"!", "?", "$", "%", "&", "@", "*"})
    return token in placeholders or token in symbols

In [None]:
def lemmatise_then_stem(tokens):
    """Lemmatise and then stem each token, leaving protected tokens untouched.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list, in the original order. Tokens for which
        numerical_or_ordinal() is true are passed through unchanged; every
        other token is lemmatised and the lemma is then stemmed.
    """
    processed = []
    for token in tokens:
        if numerical_or_ordinal(token):
            processed.append(token)
            continue
        lemma = lemmatiser.lemmatize(token)
        # The lemma is re-checked before stemming, so a lemma that happens to
        # equal a protected token is not stemmed.
        processed.append(lemma if numerical_or_ordinal(lemma) else stemmer.stem(lemma))
    return processed

In [None]:
# Reduce the words in both datasets to their base form.
for frame in (train_df, test_df):
    frame['tokens'] = frame['tokens'].apply(lemmatise_then_stem)

- Inspect the dataframes to get an idea of what is going on inside the data:

In [None]:
# Preview the training data after lemmatising and stemming.
train_df.head()

In [None]:
# Preview the test data after lemmatising and stemming.
test_df.head()

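- Join the processed tokens back into a single space-separated string, stored in a new `clean_text` column, to make the data cleaner and easier to use: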

In [None]:
# Join each token list into one space-separated string in a 'clean_text' column.
for frame in (train_df, test_df):
    frame['clean_text'] = frame['tokens'].apply(' '.join)

- Inspect a few samples

In [None]:
# Draw three random training rows; no random_state is set, so the rows
# differ from run to run.
train_samples = train_df.sample(3)

# Show each sampled row's cleaned text with its label via the project's print_text helper.
for _, row in train_samples.iterrows():
    print_text(row['clean_text'], row['label'])

- Inspect the dataframes to get an idea of what is going on inside the data:

In [None]:
# Preview the training data, now including the 'clean_text' column.
train_df.head()

In [None]:
# Preview the test data, now including the 'clean_text' column.
test_df.head()

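- Drop the columns we no longer need. The `tokens` column is redundant now that `clean_text` has been built from it. The raw `text` column is unprocessed and will not be useful for training either; note, however, that the cell below only drops `tokens`: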

In [None]:
# Remove the intermediate 'tokens' column from both datasets.
columns_to_drop = ['tokens']
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

- Save the data to the required location with specified file name:

In [None]:
# Write the processed training data first, then the processed test data.
outputs = (
    (train_df, processed_train_data, "spam_detection_train_processed.csv"),
    (test_df, processed_test_data, "spam_detection_test_processed.csv"),
)
for frame, destination, file_name in outputs:
    save_as_csv(frame, destination, file_name)