# 01 - Data Loading and Sanity Checks

This notebook:
- Loads the raw AG News training dataset
- Creates or loads a frozen train/validation split
- Runs sanity checks (counts, labels, leakage risks)
- Verifies that the data pipeline produces valid batches

Purpose:
Ensure the dataset and splits are correct before any training.

In [None]:
import sys
from pathlib import Path

# Project root = parent of the current working directory. Later cells import
# the `src` package and read `configs/data.yaml` relative to this path.
ROOT = Path("..").resolve()

# Put ROOT at the front of sys.path exactly once. A plain insert(0, ...) would
# prepend another duplicate entry every time this cell is re-run; rebuilding
# the list in place keeps ROOT first (same import priority) without duplicates.
sys.path[:] = [str(ROOT)] + [entry for entry in sys.path if entry != str(ROOT)]
print("ROOT:", ROOT)

ROOT: C:\Users\daois\Documents\Rizq\DeepLearning


In [3]:
from pathlib import Path
import yaml

from src.data.io import load_csv
from src.data.splits import freeze_train_val_split, load_frozen_split
from src.data.checks import run_sanity_checks_train_val

# Load the data configuration. The encoding is pinned explicitly because
# Path.read_text() otherwise decodes with the platform locale encoding, which
# is typically not UTF-8 on Windows and can mis-decode non-ASCII YAML content.
cfg = yaml.safe_load((ROOT / "configs/data.yaml").read_text(encoding="utf-8"))

# Join the configured locations onto the project root.
RAW = ROOT / cfg["paths"]["raw_train_csv"]
SPLIT_JSON = ROOT / cfg["paths"]["split_json"]
PROCESSED_DIR = ROOT / cfg["paths"]["processed_dir"]

# Read the raw training CSV via the project's loader.
df = load_csv(RAW)
# Preview the first rows together with the frame's shape.
df.head(), df.shape

(   label                                              title  \
 0      3  Wall St. Bears Claw Back Into the Black (Reuters)   
 1      3  Carlyle Looks Toward Commercial Aerospace (Reu...   
 2      3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
 3      3  Iraq Halts Oil Exports from Main Southern Pipe...   
 4      3  Oil prices soar to all-time record, posing new...   
 
                                          description  
 0  Reuters - Short-sellers, Wall Street's dwindli...  
 1  Reuters - Private investment firm Carlyle Grou...  
 2  Reuters - Soaring crude prices plus worries\ab...  
 3  Reuters - Authorities have halted oil export\f...  
 4  AFP - Tearaway world oil prices, toppling reco...  ,
 (120000, 3))

In [None]:
# Reuse the frozen split when its JSON file already exists; otherwise build a
# new one from the config values. The output paths passed to
# freeze_train_val_split suggest it writes the split to disk -- confirm in
# src.data.splits.
if SPLIT_JSON.exists():
    split_obj = load_frozen_split(SPLIT_JSON)
else:
    # Config values are only looked up on the "create" path, in the same
    # order as the keyword arguments below.
    data_cfg = cfg["data"]
    split_seed = data_cfg["seed"]
    split_cfg = data_cfg["split"]
    split_obj = freeze_train_val_split(
        df_train=df,
        out_split_json=SPLIT_JSON,
        out_processed_dir=PROCESSED_DIR,
        seed=split_seed,
        train_ratio=split_cfg["train_ratio"],
        val_ratio=split_cfg["val_ratio"],
        label_col=split_cfg["stratify_col"],
    )

# Show the split's metadata as the cell output.
split_obj["meta"]

{'seed': 42,
 'strategy': 'stratified_train_val',
 'train_ratio': 0.9,
 'val_ratio': 0.1,
 'n_total': 120000}

In [6]:
# Run the project's train/val sanity checks on the full frame and the split.
report = run_sanity_checks_train_val(df, split_obj)

# Enforce the result instead of only displaying it: under "Restart & Run All"
# or any non-interactive execution, a False flag would otherwise go unnoticed
# and the notebook would still finish successfully on a bad split.
assert report.ok, f"Sanity checks failed: {report.details}"

# NOTE(review): in the saved output `ok` is True while the leakage and
# duplicate counters in `details` are non-zero -- confirm that
# run_sanity_checks_train_val is meant to tolerate those.
report.ok, report.details

(True,
 {'n_total': 120000,
  'n_train': 108000,
  'n_val': 12000,
  'missing': {'label': 0, 'title': 0, 'description': 0},
  'label_dist': {'total': {1: 30000, 2: 30000, 3: 30000, 4: 30000},
   'train': {1: 27000, 2: 27000, 3: 27000, 4: 27000},
   'val': {1: 3000, 2: 3000, 3: 3000, 4: 3000}},
  'duplicate_rows_total': 86,
  'leakage_exact_train_val': 11,
  'leakage_same_title_train_val': 944})