# 01 - Data Audit

This notebook validates schema, quality, and coverage for the source CSV.

In [None]:
from pathlib import Path
import sys

import pandas as pd

# Locate the project root: when the working directory is named "notebooks"
# the root is its parent; otherwise the working directory itself is used.
PROJECT_ROOT = Path.cwd().resolve()
if Path.cwd().name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent

# Put the root at the front of sys.path (only once) so that the `src`
# package imported below can be found.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import EXPECTED_COLUMNS, RAW_DATA_PATH

# Load the raw CSV and fail fast if any expected column is absent; the
# error lists the absent columns in EXPECTED_COLUMNS order.
df = pd.read_csv(RAW_DATA_PATH)
missing = [name for name in EXPECTED_COLUMNS if name not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Last expression in the cell: shows the first rows as the cell output.
df.head()

In [None]:
# Report the frame's size: row count (thousands-separated) and column count.
print(f"Rows: {len(df):,}", f"Columns: {df.shape[1]}", sep="\n")

# Show the dtype of the first 20 columns only, to keep the output short.
print(df.dtypes.iloc[:20])

In [None]:
# Count repeated match_id values; None means the column is not present.
if "match_id" in df.columns:
    duplicate_match_ids = df["match_id"].duplicated().sum()
else:
    duplicate_match_ids = None

# Per-column count of missing values, largest first.
missing = df.isna().sum().sort_values(ascending=False)

print(f"Duplicate match_id count: {duplicate_match_ids}")

# Cell output: up to 20 columns that have at least one missing value.
missing.loc[missing > 0].head(20)

In [None]:
# Parse the date column. errors="coerce" turns values that cannot be parsed
# into NaT instead of raising, so count those explicitly: otherwise the audit
# would silently hide bad dates.
raw_dates = df["date"]
parsed_dates = pd.to_datetime(raw_dates, errors="coerce")

# Values that were present in the source but did not parse. Values that were
# already missing are excluded. Re-running this cell after the column has been
# converted reports 0, because the raw values have been replaced by then.
unparseable_dates = int((parsed_dates.isna() & raw_dates.notna()).sum())

df["date"] = parsed_dates

# Coverage summary: date range, distinct teams across both team columns,
# distinct venues and stages, plus the number of dates lost to coercion.
summary = {
    "min_date": df["date"].min(),
    "max_date": df["date"].max(),
    "teams": df[["team1", "team2"]].stack().nunique(),
    "venues": df["venue"].nunique(),
    "stages": df["match_stage"].nunique(),
    "unparseable_dates": unparseable_dates,
}
summary