# Assignment 1 - prepare.ipynb
Build reusable data preparation functions for SMS spam classification.

In [1]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

## Functions

In [2]:
def load_data(file_path: str) -> pd.DataFrame:
    """Load SMS spam data from a file path."""
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Data file not found: {file_path}")

    # First attempt standard CSV with header.
    try:
        df_csv = pd.read_csv(path)
        if {'label', 'message'}.issubset(df_csv.columns):
            return df_csv[['label', 'message']].copy()
    except Exception:
        pass

    # Fallback: UCI SMSSpamCollection format (tab-separated, no header).
    df_tsv = pd.read_csv(path, sep='	', header=None, names=['label', 'message'])
    if {'label', 'message'}.issubset(df_tsv.columns):
        return df_tsv[['label', 'message']].copy()

    raise ValueError('Expected columns label/message or SMSSpamCollection tab-separated format.')

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean raw columns and create binary target."""
    data = df.copy()
    data = data[['label', 'message']].dropna()
    data['label'] = data['label'].astype(str).str.strip().str.lower()
    data['message'] = data['message'].astype(str).str.strip()
    data = data[data['message'] != '']
    data['target'] = (data['label'] == 'spam').astype(int)
    return data[['message', 'target']]


def split_data(
    df: pd.DataFrame,
    random_state: int = 42,
    train_size: float = 0.70,
    val_size: float = 0.15,
    test_size: float = 0.15,
):
    """Split into train/validation/test with stratification."""
    total = train_size + val_size + test_size
    if abs(total - 1.0) > 1e-8:
        raise ValueError('train_size + val_size + test_size must equal 1.0')

    train_df, temp_df = train_test_split(
        df,
        test_size=(1 - train_size),
        random_state=random_state,
        stratify=df['target'],
    )

    val_ratio_within_temp = val_size / (val_size + test_size)
    val_df, test_df = train_test_split(
        temp_df,
        test_size=(1 - val_ratio_within_temp),
        random_state=random_state,
        stratify=temp_df['target'],
    )

    return train_df.reset_index(drop=True), val_df.reset_index(drop=True), test_df.reset_index(drop=True)


def save_splits(train_df: pd.DataFrame, validation_df: pd.DataFrame, test_df: pd.DataFrame, output_dir: str = '.') -> None:
    """Store train/validation/test CSV files."""
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(out / 'train.csv', index=False)
    validation_df.to_csv(out / 'validation.csv', index=False)
    test_df.to_csv(out / 'test.csv', index=False)


def print_distribution(name: str, split_df: pd.DataFrame) -> None:
    counts = split_df['target'].value_counts().to_dict()
    print(f"{name}: shape={split_df.shape}, target_0={counts.get(0, 0)}, target_1={counts.get(1, 0)}")

## Run Preparation Pipeline

In [3]:
# Update this path if your raw dataset is elsewhere.
DATA_PATH = '../assignment 1/sms+spam+collection/SMSSpamCollection'

raw_df = load_data(DATA_PATH)
prepared_df = preprocess_data(raw_df)

train_df, validation_df, test_df = split_data(prepared_df, random_state=42)
save_splits(train_df, validation_df, test_df, output_dir='.')

print_distribution('train', train_df)
print_distribution('validation', validation_df)
print_distribution('test', test_df)

print('\nSaved files: train.csv, validation.csv, test.csv')

train: shape=(3900, 2), target_0=3377, target_1=523
validation: shape=(836, 2), target_0=724, target_1=112
test: shape=(836, 2), target_0=724, target_1=112

Saved files: train.csv, validation.csv, test.csv
