# Assignment 2 - prepare.ipynb
Data version control with DVC for raw and split SMS spam data.

In [1]:
import shutil
import subprocess
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

## Utility Functions

In [2]:
def run_cmd(cmd: str) -> None:
    print(f"$ {cmd}")
    result = subprocess.run(cmd, shell=True, text=True, capture_output=True)
    if result.stdout:
        print(result.stdout)
    if result.returncode != 0:
        if result.stderr:
            print(result.stderr)
        raise RuntimeError(f'Command failed: {cmd}')


def load_data(file_path: str) -> pd.DataFrame:
    """Load SMS spam data from a file path."""
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Data file not found: {file_path}")

    # First attempt standard CSV with header.
    try:
        df_csv = pd.read_csv(path)
        if {'label', 'message'}.issubset(df_csv.columns):
            return df_csv[['label', 'message']].copy()
    except Exception:
        pass

    # Fallback: UCI SMSSpamCollection format (tab-separated, no header).
    df_tsv = pd.read_csv(path, sep='	', header=None, names=['label', 'message'])
    if {'label', 'message'}.issubset(df_tsv.columns):
        return df_tsv[['label', 'message']].copy()

    raise ValueError('Expected columns label/message or SMSSpamCollection tab-separated format.')

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    data = df[['label', 'message']].dropna().copy()
    data['label'] = data['label'].astype(str).str.strip().str.lower()
    data['message'] = data['message'].astype(str).str.strip()
    data = data[data['message'] != '']
    data['target'] = (data['label'] == 'spam').astype(int)
    return data[['message', 'target']]


def split_data(df: pd.DataFrame, random_state: int):
    train_df, temp_df = train_test_split(
        df,
        test_size=0.30,
        random_state=random_state,
        stratify=df['target'],
    )
    validation_df, test_df = train_test_split(
        temp_df,
        test_size=0.50,
        random_state=random_state,
        stratify=temp_df['target'],
    )
    return train_df.reset_index(drop=True), validation_df.reset_index(drop=True), test_df.reset_index(drop=True)


def save_splits(train_df: pd.DataFrame, validation_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
    train_df.to_csv('train.csv', index=False)
    validation_df.to_csv('validation.csv', index=False)
    test_df.to_csv('test.csv', index=False)


def print_target_distribution(file_path: str) -> None:
    df = pd.read_csv(file_path)
    counts = df['target'].value_counts().to_dict()
    count_0 = int(counts.get(0, 0))
    count_1 = int(counts.get(1, 0))
    print(f"{file_path}: 0s={count_0}, 1s={count_1}, total={len(df)}")


def save_dvc_snapshot(version_dir: str) -> None:
    out = Path(version_dir)
    out.mkdir(parents=True, exist_ok=True)
    for fname in ['raw_data.csv.dvc', 'train.csv.dvc', 'validation.csv.dvc', 'test.csv.dvc']:
        src = Path(fname)
        if src.exists():
            shutil.copy2(src, out / fname)


def restore_dvc_snapshot(version_dir: str) -> None:
    src_dir = Path(version_dir)
    for fname in ['raw_data.csv.dvc', 'train.csv.dvc', 'validation.csv.dvc', 'test.csv.dvc']:
        src = src_dir / fname
        if src.exists():
            shutil.copy2(src, Path(fname))
    run_cmd('dvc checkout')

## Initialize DVC and Create Version 1 Data

In [3]:
# Path to raw source file from assignment 1.
RAW_SOURCE_PATH = '../assignment 1/sms+spam+collection/SMSSpamCollection'

if not Path('.dvc').exists():
    run_cmd('dvc init --subdir')

raw_original = load_data(RAW_SOURCE_PATH)
raw_original.to_csv('raw_data.csv', index=False)

prepared = preprocess_data(raw_original)
train_df, validation_df, test_df = split_data(prepared, random_state=42)
save_splits(train_df, validation_df, test_df)

Path('params.yaml').write_text('random_state: 42\n', encoding='utf-8')
run_cmd('dvc add raw_data.csv train.csv validation.csv test.csv')
save_dvc_snapshot('dvc_versions/v1')

print('Version 1 created with random_state=42')

$ dvc init --subdir


Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/treeverse/dvc>

$ dvc add raw_data.csv train.csv validation.csv test.csv



To track the changes with git, run:

	git add .gitignore test.csv.dvc train.csv.dvc validation.csv.dvc raw_data.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

Version 1 created with random_state=42


## Update Split with a Different Random Seed (Version 2)

In [4]:
train_df, validation_df, test_df = split_data(prepared, random_state=7)
save_splits(train_df, validation_df, test_df)

Path('params.yaml').write_text('random_state: 7\n', encoding='utf-8')
run_cmd('dvc add train.csv validation.csv test.csv')
save_dvc_snapshot('dvc_versions/v2')

print('Version 2 created with random_state=7')

$ dvc add train.csv validation.csv test.csv



To track the changes with git, run:

	git add test.csv.dvc validation.csv.dvc train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

Version 2 created with random_state=7


## Checkout Version 1 and Print Target Distribution

In [5]:
restore_dvc_snapshot('dvc_versions/v1')
print_target_distribution('train.csv')
print_target_distribution('validation.csv')
print_target_distribution('test.csv')

$ dvc checkout


M       test.csv
M       train.csv
M       validation.csv
A       dvc_versions/v1/raw_data.csv
A       dvc_versions/v1/test.csv
A       dvc_versions/v1/train.csv
A       dvc_versions/v1/validation.csv
A       dvc_versions/v2/raw_data.csv
A       dvc_versions/v2/test.csv
A       dvc_versions/v2/train.csv
A       dvc_versions/v2/validation.csv

train.csv: 0s=3377, 1s=523, total=3900
validation.csv: 0s=724, 1s=112, total=836
test.csv: 0s=724, 1s=112, total=836


## Checkout Version 2 and Print Target Distribution

In [6]:
restore_dvc_snapshot('dvc_versions/v2')
print_target_distribution('train.csv')
print_target_distribution('validation.csv')
print_target_distribution('test.csv')

$ dvc checkout


M       test.csv
M       train.csv
M       validation.csv

train.csv: 0s=3377, 1s=523, total=3900
validation.csv: 0s=724, 1s=112, total=836
test.csv: 0s=724, 1s=112, total=836
