In [11]:
!pip install numpy pandas scikit-learn




[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:

# # E2E Dataset Pipeline — Notebook (dengan Akselerasi CUDA)
#
# Versi notebook yang sama seperti sebelumnya namun **tidak menggunakan `def` atau function**.
# Semua langkah ditulis sebagai blok kode top-level yang bisa dijalankan berurutan.
#
# ## Perubahan
# - Ditambahkan flag `USE_CUDA` untuk beralih antara eksekusi CPU (Scikit-learn) dan GPU (RAPIDS cuDF & cuML).
# - Pustaka RAPIDS akan digunakan untuk mempercepat pemrosesan data dan training model jika `USE_CUDA = True` dan environment-nya mendukung.
# - Pastikan Anda telah menginstal RAPIDS. Cara termudah adalah menggunakan Conda:
# ```bash
# # conda create -n rapids -c rapidsai -c conda-forge -c nvidia rapids=23.10 python=3.10 cudatoolkit=11.8
# # conda activate rapids
# ```

# %%
# 1) Konfigurasi — sesuaikan path sebelum menjalankan sel-sel berikut
DATA_DIR = "Data"      # folder berisi C01..C10
BASE_DIR = "."         # tempat output/artifact akan ditulis (output/, models/, reports/, ...)
LABEL_COL = "label"    # nama kolom label di file train*.csv
TIME_SERIES = False    # True jika dataset adalah time-series

# --- TAMBAHAN: Konfigurasi CUDA ---
# Ubah menjadi True jika Anda ingin menggunakan akselerasi GPU dengan RAPIDS
USE_CUDA = True

print("DATA_DIR:", DATA_DIR)
print("BASE_DIR:", BASE_DIR)
print("LABEL_COL:", LABEL_COL)
print("TIME_SERIES:", TIME_SERIES)
print(f"USE_CUDA: {USE_CUDA}")

# %%
# 2) Imports & dependensi (dengan tambahan untuk CUDA)
from __future__ import annotations
import os
import re
import json
import glob
import textwrap
import math

# --- Pustaka CPU (Pandas, Scikit-learn) ---
import numpy as np
import pandas as pd
from dateutil import parser as dateparser
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit, train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load
import pickle # Diperlukan untuk menyimpan model cuML

# --- TAMBAHAN: Import pustaka GPU (RAPIDS) jika USE_CUDA aktif ---
if USE_CUDA:
    try:
        import cudf
        import cupy as cp
        from cuml.model_selection import train_test_split as cuml_train_test_split
        from cuml.metrics import f1_score as cuml_f1_score, confusion_matrix as cuml_confusion_matrix
        from cuml.preprocessing import StandardScaler as cuml_StandardScaler
        from cuml.impute import SimpleImputer as cuml_SimpleImputer
        from cuml.linear_model import LogisticRegression as cuml_LogisticRegression
        from cuml.ensemble import RandomForestClassifier as cuml_RandomForestClassifier
        print("✅ Pustaka RAPIDS (cuDF, cuML) berhasil diimpor.")
    except ImportError:
        print("⚠️ Peringatan: Pustaka RAPIDS tidak ditemukan. Mengubah USE_CUDA menjadi False.")
        USE_CUDA = False
else:
    print("Eksekusi menggunakan CPU (Pandas, Scikit-learn).")

# Optional imports for extraction
try:
    import cv2
except Exception:
    cv2 = None

try:
    import pytesseract
    from PIL import Image
except Exception:
    pytesseract = None
    Image = None

LABEL_MAP = {"Low": 0, "Medium": 1, "High": 2, 0: 0, 1: 1, 2: 2}

print("cv2 available:", cv2 is not None)
print("pytesseract available:", pytesseract is not None)


# %%
# 3) Siapkan direktori artifact (tanpa fungsi)
paths = {
    "output": os.path.join(BASE_DIR, "output"),
    "extracted": os.path.join(BASE_DIR, "output", "extracted"),
    "models": os.path.join(BASE_DIR, "models"),
    "reports": os.path.join(BASE_DIR, "reports"),
    "predictions": os.path.join(BASE_DIR, "predictions"),
    "logs": os.path.join(BASE_DIR, "logs"),
    "config": os.path.join(BASE_DIR, "config"),
}
for p in paths.values():
    os.makedirs(p, exist_ok=True)

print("Artifact directories ensured:")
for k, v in paths.items():
    print(f" - {k}: {v}")

# %%
# 4) Tulis templates (README, contoh config, evaluation report)
# (Tidak ada perubahan di sel ini)
readme = textwrap.dedent(
    """
    # Project README (Template)

    ## Tujuan
    - Menghasilkan `predictions/test_predictions.csv` dengan format label numerik (Low=0, Medium=1, High=2).
    - Artefak wajib:
      - `output/extracted_master.csv`
      - `output/master_imputed.csv`
      - `models/best_model.joblib` atau `models/best_model.pkl`
      - `reports/evaluation_report.txt`
      - `logs/failed_images.csv`
    """
).strip()

readme_path = os.path.join(BASE_DIR, "README_TEMPLATE.md")
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(readme)

eval_template = textwrap.dedent(
    """
    EVALUATION REPORT (Auto-generated)
    =================================
    [Macro F1]
    {macro_f1}
    [Per-class Metrics]
    {classification_report}
    [Confusion Matrix]
    {confusion_matrix}
    """
).strip()
eval_path = os.path.join(paths["reports"], "evaluation_report.txt")
with open(eval_path, "w", encoding="utf-8") as f:
    f.write(eval_template)

cfg = { "indicator_configs": { "C01": { "has_legend": True } } }
cfg_path = os.path.join(paths["config"], "indicator_configs.json")
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f, indent=2)

print("Templates & config written:", readme_path, eval_path, cfg_path)


# %%
# 5) Inventory (scan Data/C* untuk gambar & csv)
# (Tidak ada perubahan di sel ini)
rows = []
for cdir in sorted(glob.glob(os.path.join(DATA_DIR, "C*"))):
    indicator = os.path.basename(cdir)
    imgs = glob.glob(os.path.join(cdir, "*.png")) + glob.glob(os.path.join(cdir, "*.jpg"))
    train_csvs = glob.glob(os.path.join(cdir, "train*.csv"))
    test_csvs = glob.glob(os.path.join(cdir, "test*.csv"))
    for p in imgs:
        rows.append({"indicator": indicator, "type": "image", "path": p})
    for p in train_csvs:
        rows.append({"indicator": indicator, "type": "train_csv", "path": p})
    for p in test_csvs:
        rows.append({"indicator": indicator, "type": "test_csv", "path": p})

inv = pd.DataFrame(rows)
inv_path = os.path.join(paths["output"], "inventory.csv")
inv.to_csv(inv_path, index=False)
print("Inventory written:", inv_path)
print(inv.head())

# %%
# 6) Extraction (inline, best-effort).
# (Tidak ada perubahan di sel ini, karena ekstraksi gambar tetap di CPU)
failed_rows = []
extracted_files = []
for cdir in sorted(glob.glob(os.path.join(DATA_DIR, "C*"))):
    indicator = os.path.basename(cdir)
    imgs = sorted(glob.glob(os.path.join(cdir, "*.png")) + glob.glob(os.path.join(cdir, "*.jpg")))
    all_rows = []
    for img_path in imgs:
        m = re.search(r"(19|20)\d{2}", os.path.basename(img_path))
        year = int(m.group(0)) if m else None
        date_iso = f"{year:04d}-01-01" if year else None

        # Placeholder extraction
        val = np.random.rand() * 100
        status = "warning"
        note = "placeholder_extraction"

        all_rows.append({
            "source_image": img_path, "indicator": indicator, "date": date_iso,
            "series_name": "series_1", "value": val, "extraction_status": status, "note": note,
        })

    if all_rows:
        out_path = os.path.join(paths["extracted"], f"{indicator}_extracted.csv")
        pd.DataFrame(all_rows).to_csv(out_path, index=False)
        extracted_files.append(out_path)
print("Extraction complete.")

# %%
# 7) Merge extracted → master (normalisasi tanggal)
# (Tidak ada perubahan di sel ini)
files = sorted(glob.glob(os.path.join(paths["extracted"], "*_extracted.csv")))
if not files:
    master = pd.DataFrame(columns=["indicator", "date", "value"])
else:
    dfs = [pd.read_csv(f) for f in files]
    master = pd.concat(dfs, ignore_index=True)
    master['date'] = pd.to_datetime(master['date'], errors='coerce').dt.strftime('%Y-%m-%d')

master_path = os.path.join(paths["output"], "extracted_master.csv")
master.to_csv(master_path, index=False)
print("Master extracted written:", master_path)

# %%
# 8) Impute (pivot → iterative imputer) — menulis master_imputed.csv
# (Tidak ada perubahan di sel ini, imputasi kompleks tetap di CPU)
df = pd.read_csv(master_path)
if df.empty:
    pd.DataFrame().to_csv(os.path.join(paths["output"], "master_imputed.csv"))
    print("Master empty, skipping imputation.")
else:
    df_agg = df.groupby(["indicator", "date"])['value'].mean().reset_index()
    pivot = df_agg.pivot(index="date", columns="indicator", values="value").sort_index()
    for c in pivot.columns:
        pivot[f"{c}_missing_flag"] = pivot[c].isna().astype(int)

    num_cols = [c for c in pivot.columns if not c.endswith("_missing_flag")]
    if not pivot.empty and len(num_cols) > 0:
        imputer = IterativeImputer(random_state=42, max_iter=10)
        pivot[num_cols] = imputer.fit_transform(pivot[num_cols])

    out_imputed = os.path.join(paths["output"], "master_imputed.csv")
    pivot.to_csv(out_imputed)
    print("Imputed master written:", out_imputed)


# %%
# 9) Feature engineering (lags, rolling, temporal)
imputed_path = os.path.join(paths["output"], "master_imputed.csv")
if not os.path.exists(imputed_path):
    print("No imputed file available — skipping feature engineering.")
else:
    # --- MODIFIKASI: Pilih antara cuDF (GPU) atau Pandas (CPU) ---
    if USE_CUDA:
        print("Running Feature Engineering on GPU with cuDF...")
        df = cudf.read_csv(imputed_path)
        df['date'] = cudf.to_datetime(df['date'], errors='coerce')
        df = df.dropna(subset=['date']).sort_values('date')

        feat = df.copy()
        feat['year'] = feat['date'].dt.year
        feat['month'] = feat['date'].dt.month
        feat['quarter'] = feat['date'].dt.quarter

        value_cols = [c for c in feat.columns if c not in ['date', 'year', 'month', 'quarter']]
        lags = [1, 3, 6]
        for c in value_cols:
            for L in lags:
                feat[f"{c}_lag_{L}"] = feat[c].shift(L)
            feat[f"{c}_rolling_mean_3"] = feat[c].rolling(3).mean()
            feat[f"{c}_rolling_std_3"] = feat[c].rolling(3).std()

        # cuDF tidak punya pct_change, kita hitung manual
        for c in value_cols:
             for p in [1, 3]:
                shifted = feat[c].shift(p)
                feat[f'{c}_pct_change_{p}'] = (feat[c] - shifted) / shifted

        # Tulis ke file dari GPU
        features_path = os.path.join(paths["output"], "features_engineered.csv")
        feat.to_csv(features_path, index=False)
        print("Features engineered (GPU) written:", features_path)

    else:
        print("Running Feature Engineering on CPU with Pandas...")
        df = pd.read_csv(imputed_path)
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df = df.dropna(subset=['date']).sort_values('date')

        feat = df.copy()
        feat['year'] = feat['date'].dt.year
        feat['month'] = feat['date'].dt.month
        feat['quarter'] = feat['date'].dt.quarter

        value_cols = [c for c in feat.columns if c not in ['date', 'year', 'month', 'quarter']]
        lags = [1, 3, 6]
        for c in value_cols:
            for L in lags:
                feat[f"{c}_lag_{L}"] = feat[c].shift(L)
            feat[f"{c}_rolling_mean_3"] = feat[c].rolling(3).mean()
            feat[f"{c}_rolling_std_3"] = feat[c].rolling(3).std()
            feat[f"{c}_pct_change_1"] = feat[c].pct_change(1)
            feat[f"{c}_pct_change_3"] = feat[c].pct_change(3)

        features_path = os.path.join(paths["output"], "features_engineered.csv")
        feat.to_csv(features_path, index=False)
        print("Features engineered (CPU) written:", features_path)

# %%
# 10) Load training targets & align with features
train_files = sorted(glob.glob(os.path.join(DATA_DIR, "C*", "train*.csv")))

if not train_files:
    print("No train*.csv found. Cannot train.")
else:
    # --- MODIFIKASI: Pilih antara cuDF (GPU) atau Pandas (CPU) ---
    if USE_CUDA:
        print("Loading and aligning data on GPU with cuDF...")
        trains = [cudf.read_csv(p) for p in train_files]
        train_df = cudf.concat(trains, ignore_index=True)
        feat = cudf.read_csv(os.path.join(paths["output"], "features_engineered.csv"))
    else:
        print("Loading and aligning data on CPU with Pandas...")
        trains = [pd.read_csv(p) for p in train_files]
        train_df = pd.concat(trains, ignore_index=True)
        feat = pd.read_csv(os.path.join(paths["output"], "features_engineered.csv"))

    if LABEL_COL in train_df.columns:
        # Peta label tetap di CPU karena sederhana, lalu pindahkan ke GPU jika perlu
        mapped_labels = train_df[LABEL_COL].to_pandas().map(LABEL_MAP).values
        if USE_CUDA:
            train_df[LABEL_COL] = cudf.Series(mapped_labels)
        else:
            train_df[LABEL_COL] = mapped_labels
    else:
        raise KeyError(f"Label column '{LABEL_COL}' not found.")

    common_key = 'date'
    if common_key not in feat.columns or common_key not in train_df.columns:
         print(f"No common key '{common_key}' found. Cannot merge.")
    else:
        # Melakukan merge
        if USE_CUDA:
             merged = cudf.merge(train_df, feat, on=common_key, how='inner')
        else:
             merged = pd.merge(train_df, feat, on=common_key, how='inner')

        if merged.empty:
            print("Merging features with training yielded empty set.")
        else:
            y = merged[LABEL_COL]
            X = merged.drop(columns=[LABEL_COL])
            print("Aligned X, y shapes:", X.shape, y.shape)

            # %%
            # 11) Train & evaluate
            if USE_CUDA:
                print("Training and evaluating on GPU with cuML...")
                num_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

                # Preprocessing manual untuk cuML (tanpa pipeline)
                imputer = cuml_SimpleImputer(strategy="median")
                scaler = cuml_StandardScaler(with_mean=False)
                X_num_imputed = imputer.fit_transform(X[num_cols])
                X_processed = scaler.fit_transform(X_num_imputed)

                candidates = {
                    "logreg": cuml_LogisticRegression(class_weight='balanced'),
                    "rf": cuml_RandomForestClassifier(n_estimators=300, max_depth=16, random_state=42)
                }

                best_score = -1.0
                best_name = None
                splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                for name, clf in candidates.items():
                    cv_scores = []
                    for train_idx, val_idx in splitter.split(X_processed.get(), y.get()): # .get() untuk konversi ke numpy
                        X_tr, X_va = X_processed[train_idx], X_processed[val_idx]
                        y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

                        clf.fit(X_tr, y_tr)
                        preds = clf.predict(X_va)
                        cv_scores.append(cuml_f1_score(y_va, preds, average='macro'))

                    score = float(cp.mean(cp.array(cv_scores)))
                    if score > best_score:
                        best_score = score
                        best_name = name

                print(f"Best model on GPU: {best_name} with CV F1-score: {best_score:.4f}")

                # Latih model terbaik pada data penuh dan simpan
                best_model = candidates[best_name]
                best_model.fit(X_processed, y)
                model_path = os.path.join(paths["models"], "best_model.pkl")
                with open(model_path, "wb") as f:
                    pickle.dump(best_model, f)
                print("Best model (cuML) saved:", model_path)

                # Evaluasi holdout
                X_tr, X_te, y_tr, y_te = cuml_train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)
                best_model.fit(X_tr, y_tr)
                pred = best_model.predict(X_te)
                macro = cuml_f1_score(y_te, pred, average='macro')
                cm = cuml_confusion_matrix(y_te, pred)
                # Classification report tidak ada di cuML, kita buat dari scikit-learn
                cls_rep = classification_report(y_te.get(), pred.get(), digits=4)

            else:
                print("Training and evaluating on CPU with Scikit-learn...")
                num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
                cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

                pre = ColumnTransformer(transformers=[
                    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler(with_mean=False))]), num_cols),
                    ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent"))]), cat_cols)
                ])

                candidates = {
                    "logreg": LogisticRegression(max_iter=200, class_weight='balanced'),
                    "rf": RandomForestClassifier(n_estimators=300, max_depth=None, random_state=42, class_weight='balanced')
                }

                best_pipe = None
                best_score = -1.0
                best_name = None
                splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

                for name, clf in candidates.items():
                    pipe = Pipeline([("pre", pre), ("clf", clf)])
                    cv_scores = []
                    for train_idx, val_idx in splitter.split(X, y):
                        X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
                        y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
                        pipe.fit(X_tr, y_tr)
                        preds = pipe.predict(X_va)
                        cv_scores.append(f1_score(y_va, preds, average='macro'))
                    score = np.mean(cv_scores)
                    if score > best_score:
                        best_score, best_name, best_pipe = score, name, pipe

                print(f"Best model on CPU: {best_name} with CV F1-score: {best_score:.4f}")

                best_pipe.fit(X, y)
                model_path = os.path.join(paths["models"], "best_model.joblib")
                dump(best_pipe, model_path)
                print("Best model (scikit-learn) saved:", model_path)

                X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
                best_pipe.fit(X_tr, y_tr)
                pred = best_pipe.predict(X_te)
                macro = f1_score(y_te, pred, average='macro')
                cls_rep = classification_report(y_te, pred, digits=4)
                cm = confusion_matrix(y_te, pred)

            # Tulis laporan (hasil dikonversi ke format CPU jika perlu)
            report_cm = cm.get() if USE_CUDA else cm
            report_macro = float(macro) if not USE_CUDA else macro.item()
            report_best_name = best_name

            report = textwrap.dedent(f"""
            EVALUATION REPORT (Auto-generated on {'GPU' if USE_CUDA else 'CPU'})
            ====================================================
            Best Model: {report_best_name}
            CV Macro F1 (mean): {best_score:.4f}
            Holdout Macro F1: {report_macro:.4f}
            Per-class metrics:
            {cls_rep}
            Confusion matrix:
            {report_cm}
            """).strip()
            with open(os.path.join(paths["reports"], "evaluation_report.txt"), "w", encoding="utf-8") as f:
                f.write(report)
            print("Evaluation report written.")

            # %%
            # 12) Build submission
            test_files = sorted(glob.glob(os.path.join(DATA_DIR, "C*", "test*.csv")))
            if not test_files:
                print("No test*.csv found. Skipping prediction.")
            else:
                if USE_CUDA:
                    print("Building submission on GPU...")
                    # Muat model cuML, imputer, dan scaler
                    with open(model_path, 'rb') as f: model = pickle.load(f)

                    tests = [cudf.read_csv(p) for p in test_files]
                    test_df = cudf.concat(tests, ignore_index=True)
                    merged_test = cudf.merge(test_df, feat, on=common_key, how='left')

                    X_test_num = merged_test[num_cols]
                    X_test_imputed = imputer.transform(X_test_num)
                    X_test_processed = scaler.transform(X_test_imputed)

                    preds = model.predict(X_test_processed)

                    sub = merged_test[["id"]].copy() if "id" in merged_test.columns else cudf.DataFrame()
                    sub["label"] = preds
                    # Konversi ke pandas sebelum menyimpan
                    sub = sub.to_pandas()

                else:
                    print("Building submission on CPU...")
                    model = load(model_path)

                    tests = [pd.read_csv(p) for p in test_files]
                    test_df = pd.concat(tests, ignore_index=True)
                    merged_test = pd.merge(test_df, feat, on=common_key, how='left')

                    # Scikit-learn pipeline menangani preprocessing secara otomatis
                    preds = model.predict(merged_test)

                    sub = merged_test[["id"]].copy() if "id" in merged_test.columns else pd.DataFrame()
                    sub["label"] = preds

                out_sub = os.path.join(paths["predictions"], "test_predictions.csv")
                sub.to_csv(out_sub, index=False)
                print("Submission written:", out_sub)

# # Catatan Akhir
# - Notebook ini sekarang memiliki dua jalur eksekusi: satu untuk CPU dengan `pandas` dan `scikit-learn`, dan satu lagi untuk GPU dengan `cudf` dan `cuml`. Anda dapat mengontrolnya dengan flag `USE_CUDA` di sel pertama.
# - Perhatikan bahwa model yang dilatih di GPU (`.pkl`) tidak kompatibel dengan yang dilatih di CPU (`.joblib`), dan sebaliknya.
# - Preprocessing untuk cuML dilakukan secara manual (imputer lalu scaler) karena `cuml` tidak memiliki abstraksi `Pipeline` atau `ColumnTransformer` yang setara dengan `scikit-learn`.
# - Jika Anda mendapatkan `ImportError` untuk `cudf` atau `cuml`, pastikan Anda telah menginstal RAPIDS dengan benar di environment conda Anda dan menjalankannya dari sana.

DATA_DIR: Data
BASE_DIR: .
LABEL_COL: label
TIME_SERIES: False
USE_CUDA: True
⚠️ Peringatan: Pustaka RAPIDS tidak ditemukan. Mengubah USE_CUDA menjadi False.
cv2 available: False
pytesseract available: False
Artifact directories ensured:
 - output: .\output
 - extracted: .\output\extracted
 - models: .\models
 - reports: .\reports
 - predictions: .\predictions
 - logs: .\logs
 - config: .\config
Templates & config written: .\README_TEMPLATE.md .\reports\evaluation_report.txt .\config\indicator_configs.json
Inventory written: .\output\inventory.csv
  indicator       type                    path
0       C01  train_csv  Data\C01\train C01.csv
1       C01   test_csv   Data\C01\test C01.csv
2       C02  train_csv  Data\C02\train C02.csv
3       C02   test_csv   Data\C02\test C02.csv
4       C03  train_csv  Data\C03\train C03.csv
Extraction complete.
Master extracted written: .\output\extracted_master.csv
Master empty, skipping imputation.
Running Feature Engineering on CPU with Pandas...


KeyError: 'date'