In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re

In [2]:
def load_data(file_path):
    """Loads the SMS spam dataset from the given file path."""
    df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])
    return df

In [3]:
def preprocess_text(text):
    """Preprocess text by converting to lowercase, removing special characters, and extra whitespace."""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
def prepare_data(df):
    """Prepares data by preprocessing text, encoding labels, and splitting into train/validation/test."""
    df = df.copy()
    df['message'] = df['message'].apply(preprocess_text)
    df['label'] = (df['label'] == 'spam').astype(int)
    df['message'] = df['message'].fillna('')
    
    train_val, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
    train, validation = train_test_split(train_val, test_size=0.25, random_state=42, stratify=train_val['label'])
    
    train.to_csv('train.csv', index=False)
    validation.to_csv('validation.csv', index=False)
    test.to_csv('test.csv', index=False)
    
    print(f"Data split sizes:")
    print(f"Train: {len(train)} samples")
    print(f"Validation: {len(validation)} samples")
    print(f"Test: {len(test)} samples")
    
    return train, validation, test

In [5]:
if __name__ == "__main__":
    file_path = 'sms+spam+collection/SMSSpamCollection'
    df = load_data(file_path)
    train, validation, test = prepare_data(df)

Data split sizes:
Train: 3342 samples
Validation: 1115 samples
Test: 1115 samples
