In [2]:
#!/usr/bin/env python3
"""
01_data_checking.py
Initial data checking / assessment for CSV datasets:
- rows/cols
- column types
- missing values (count + %)
- duplicates
- basic descriptive stats (numeric + categorical)
- optional: save report files to ./outputs/

Usage examples:
  python 01_data_checking.py student_combined_data.csv
  python 01_data_checking.py student_performance_data.csv student_aptitude_data.csv
  python 01_data_checking.py --save student_combined_data.csv
  python 01_data_checking.py --save --outdir reports student_combined_data.csv
"""

from __future__ import annotations

import argparse
import os
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd


def safe_read_csv(path: Path) -> pd.DataFrame:
    """Read CSV with a couple of safe fallbacks."""
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        # common fallback
        return pd.read_csv(path, encoding="latin-1")


def summarize_df(df: pd.DataFrame) -> Dict[str, Any]:
    n_rows, n_cols = df.shape

    # missing
    missing_count = df.isna().sum()
    missing_pct = (missing_count / max(n_rows, 1) * 100).round(2)

    missing_table = (
        pd.DataFrame({"missing_count": missing_count, "missing_pct": missing_pct})
        .sort_values(["missing_count", "missing_pct"], ascending=False)
    )

    # duplicates
    dup_rows = int(df.duplicated().sum())

    # dtypes
    dtype_table = pd.DataFrame({"dtype": df.dtypes.astype(str)}).reset_index().rename(columns={"index": "column"})

    # stats
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in df.columns if c not in numeric_cols]

    numeric_desc = df[numeric_cols].describe().T if numeric_cols else pd.DataFrame()
    categorical_desc = df[cat_cols].describe().T if cat_cols else pd.DataFrame()

    # unique counts (useful for "karakteristik data")
    nunique = df.nunique(dropna=True).sort_values(ascending=False)
    nunique_table = pd.DataFrame({"nunique": nunique}).reset_index().rename(columns={"index": "column"})

    return {
        "shape": (n_rows, n_cols),
        "dup_rows": dup_rows,
        "missing_table": missing_table,
        "dtype_table": dtype_table,
        "nunique_table": nunique_table,
        "numeric_desc": numeric_desc,
        "categorical_desc": categorical_desc,
        "numeric_cols": numeric_cols,
        "cat_cols": cat_cols,
    }


def print_header(title: str) -> None:
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def print_compact_table(df: pd.DataFrame, max_rows: int = 30) -> None:
    if df.empty:
        print("(empty)")
        return
    if len(df) > max_rows:
        print(df.head(max_rows).to_string())
        print(f"... ({len(df) - max_rows} rows omitted)")
    else:
        print(df.to_string())


def ensure_outdir(outdir: Path) -> None:
    outdir.mkdir(parents=True, exist_ok=True)


def save_reports(summary: Dict[str, Any], dataset_name: str, outdir: Path) -> None:
    """
    Save summary tables to csv and a text report.
    """
    ensure_outdir(outdir)

    # Save tables
    summary["missing_table"].to_csv(outdir / f"{dataset_name}__missing.csv")
    summary["dtype_table"].to_csv(outdir / f"{dataset_name}__dtypes.csv", index=False)
    summary["nunique_table"].to_csv(outdir / f"{dataset_name}__nunique.csv", index=False)

    if not summary["numeric_desc"].empty:
        summary["numeric_desc"].to_csv(outdir / f"{dataset_name}__numeric_desc.csv")
    if not summary["categorical_desc"].empty:
        summary["categorical_desc"].to_csv(outdir / f"{dataset_name}__categorical_desc.csv")

    # Save a human-readable txt report
    report_path = outdir / f"{dataset_name}__report.txt"
    with report_path.open("w", encoding="utf-8") as f:
        n_rows, n_cols = summary["shape"]
        f.write(f"Dataset: {dataset_name}\n")
        f.write(f"Shape: {n_rows} rows x {n_cols} cols\n")
        f.write(f"Duplicate rows: {summary['dup_rows']}\n\n")

        f.write("=== Dtypes ===\n")
        f.write(summary["dtype_table"].to_string(index=False))
        f.write("\n\n=== Unique counts (top) ===\n")
        f.write(summary["nunique_table"].head(50).to_string(index=False))
        f.write("\n\n=== Missing values (sorted) ===\n")
        f.write(summary["missing_table"].head(100).to_string())
        f.write("\n")

        if not summary["numeric_desc"].empty:
            f.write("\n=== Numeric describe ===\n")
            f.write(summary["numeric_desc"].to_string())
            f.write("\n")

        if not summary["categorical_desc"].empty:
            f.write("\n=== Categorical describe ===\n")
            f.write(summary["categorical_desc"].to_string())
            f.write("\n")


def main(paths: List[str], save: bool, outdir: str) -> None:
    outdir_path = Path(outdir)

    for p in paths:
        path = Path(p)
        if not path.exists():
            print_header(f"[SKIP] File not found: {p}")
            continue

        dataset_name = path.stem

        print_header(f"DATASET: {path.name}")
        df = safe_read_csv(path)

        # basic preview
        print("Preview (first 5 rows):")
        print(df.head(5).to_string(index=False))

        summary = summarize_df(df)

        # 1) jumlah
        n_rows, n_cols = summary["shape"]
        print_header("1) JUMLAH DATA")
        print(f"Rows: {n_rows}")
        print(f"Columns: {n_cols}")

        # 2) karakteristik
        print_header("2) KARAKTERISTIK DATA")
        print(f"Numeric cols: {len(summary['numeric_cols'])} -> {summary['numeric_cols'][:15]}" + (" ..." if len(summary["numeric_cols"]) > 15 else ""))
        print(f"Categorical/other cols: {len(summary['cat_cols'])} -> {summary['cat_cols'][:15]}" + (" ..." if len(summary["cat_cols"]) > 15 else ""))

        print("\nDtypes:")
        print_compact_table(summary["dtype_table"], max_rows=80)

        print("\nUnique counts per column (top 30):")
        print_compact_table(summary["nunique_table"].head(30), max_rows=30)

        # 3) missing values
        print_header("3) MISSING VALUES")
        missing_nonzero = summary["missing_table"][summary["missing_table"]["missing_count"] > 0]
        if missing_nonzero.empty:
            print("No missing values found âœ…")
        else:
            print("Columns with missing values (top 50):")
            print_compact_table(missing_nonzero.head(50), max_rows=50)

        # 4) duplicates
        print_header("4) DUPLICATE ROWS")
        print(f"Duplicate rows: {summary['dup_rows']}")

        # 5) basic stats
        print_header("5) RINGKASAN STATISTIK")
        if summary["numeric_desc"].empty:
            print("No numeric columns detected.")
        else:
            print("Numeric describe (top 30 rows):")
            print_compact_table(summary["numeric_desc"].head(30), max_rows=30)

        if summary["categorical_desc"].empty:
            print("\nNo categorical/object columns detected.")
        else:
            print("\nCategorical describe (top 30 rows):")
            print_compact_table(summary["categorical_desc"].head(30), max_rows=30)

        # save outputs
        if save:
            print_header("SAVING REPORTS")
            save_reports(summary, dataset_name, outdir_path)
            print(f"Saved to: {outdir_path.resolve()}")

    print("\nDone.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Initial data checking / assessment for CSV files.")
    parser.add_argument(
        "csv_paths",
        nargs="*",
        help="Paths to CSV files. If empty, will try common defaults in current folder.",
    )
    parser.add_argument("--save", action="store_true", help="Save reports to outdir.")
    parser.add_argument("--outdir", default="outputs", help="Output directory for saved reports (default: outputs).")

    args, unknown = parser.parse_known_args()

    default_candidates = [
        "student_combined_data.csv",
        "student_performance_data.csv",
        "student_aptitude_data.csv",
    ]

    paths = args.csv_paths if args.csv_paths else default_candidates
    main(paths=paths, save=args.save, outdir=args.outdir)



DATASET: kernel-2190e5de-075c-47a8-a2af-667b321e4597.json
Preview (first 5 rows):
  {
NaN
NaN
NaN
NaN
NaN

1) JUMLAH DATA
Rows: 12
Columns: 1

2) KARAKTERISTIK DATA
Numeric cols: 1 -> ['{']
Categorical/other cols: 0 -> []

Dtypes:
  column    dtype
0      {  float64

Unique counts per column (top 30):
  column  nunique
0      {        0

3) MISSING VALUES
Columns with missing values (top 50):
   missing_count  missing_pct
{             12        100.0

4) DUPLICATE ROWS
Duplicate rows: 11

5) RINGKASAN STATISTIK
Numeric describe (top 30 rows):
   count  mean  std  min  25%  50%  75%  max
{    0.0   NaN  NaN  NaN  NaN  NaN  NaN  NaN

No categorical/object columns detected.

Done.
