<a href="https://colab.research.google.com/github/elangbijak4/Adaptive-Stratified-Sampling---ASS/blob/main/Demo2_ASS_menggunakan_TabNet_untuk_mengadaptasi_strata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# ====================================================================
# INSTALASI DAN IMPORT (Langkah Awal di Colab)
# Pastikan Runtime menggunakan GPU jika Anda bekerja dengan data besar
# ====================================================================
!pip install pytorch-tabnet numpy pandas scikit-learn

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# PERBAIKAN DI SINI: Impor dari lokasi yang benar
from pytorch_tabnet.tab_model import TabNetRegressor

from pytorch_tabnet.metrics import Metric
import torch



In [14]:
# Mengatur seed untuk reproduksi
np.random.seed(42)
torch.manual_seed(42)

# ====================================================================
# 1. SIMULASI POPULASI DAN STRATIFIKASI
# ====================================================================
print("1. SIMULASI DATA DAN STRATIFIKASI AWAL")

# Simulasi Data Tabular
POP_SIZE = 5000
data = {
    'Region': np.random.choice(['East', 'West', 'Central'], POP_SIZE),
    'Age': np.random.randint(20, 60, POP_SIZE),
    'Income': np.random.normal(50000, 15000, POP_SIZE),
    'Feature_A': np.random.rand(POP_SIZE),
    # Target (Y): Dipengaruhi Region dan Varians (Varians tinggi di 'Central')
    'Target_Y': np.random.normal(50 + 10 * (np.random.choice([0, 1, 2], POP_SIZE) == 2) * 5,
                                 (np.random.choice([1, 2, 3], POP_SIZE) == 3) * 20 + 5, POP_SIZE)
}
df_populasi = pd.DataFrame(data)

# Stratifikasi berdasarkan 'Region'
cat_features = ['Region']
for feature in cat_features:
    df_populasi[feature] = LabelEncoder().fit_transform(df_populasi[feature])

# Pisahkan Fitur (X) dan Target (Y)
X = df_populasi.drop('Target_Y', axis=1).values
Y = df_populasi['Target_Y'].values.reshape(-1, 1)

# Daftar ID seluruh populasi
all_indices = np.arange(POP_SIZE)

# ====================================================================
# 2. PENGAMBILAN SAMPEL AWAL (N) DAN PELATIHAN BASELINE
# ====================================================================
print("\n2. PENGAMBILAN SAMPEL AWAL (N)")

# N_initial: 2% dari populasi
N_initial_size = int(0.02 * POP_SIZE)

# Ambil sampel awal secara acak (memastikan N_initial_size per strata jika memungkinkan, tapi di sini disederhanakan)
initial_indices = np.random.choice(all_indices, size=N_initial_size, replace=False)

X_N, Y_N = X[initial_indices], Y[initial_indices]

# Pisahkan N_initial menjadi training dan validation
X_train_N, X_val_N, y_train_N, y_val_N = train_test_split(X_N, Y_N, test_size=0.3, random_state=42)

# Inisialisasi dan Latih TabNet Awal (Baseline)
model_baseline = TabNetRegressor()
model_baseline.fit(
    X_train_N, y_train_N,
    eval_set=[(X_val_N, y_val_N)],
    patience=50,
    max_epochs=100
)
print(f"  Jumlah Sampel Awal (N): {N_initial_size}")
print(f"  Loss Baseline (val_0): {model_baseline.history['val_0_mse'][-1]:.4f}")

# ====================================================================
# 3. ANALISIS VARIANSI / EFISIENSI (Menghitung Loss per Strata)
# ====================================================================
print("\n3. ANALISIS VARIANSI (Mengukur Loss per Strata)")

# Gunakan SISA POPULASI untuk menguji ketidakpastian model saat ini
remaining_indices = np.array(list(set(all_indices) - set(initial_indices)))
X_rem, Y_rem = X[remaining_indices], Y[remaining_indices]
df_rem = df_populasi.iloc[remaining_indices].copy()

# Prediksi pada SISA POPULASI
preds_rem = model_baseline.predict(X_rem)
df_rem['Prediction_Error'] = (preds_rem - Y_rem)**2 # Menggunakan MSE sebagai proxy Loss/Varians

# Hitung Rata-Rata Loss (Varians Prediksi) per Strata ('Region')
strata_loss = df_rem.groupby('Region')['Prediction_Error'].mean()
total_loss = strata_loss.sum()

print("  Rata-rata Loss Model Baseline di Sisa Populasi (per Strata):")
print(strata_loss.to_string())

# ====================================================================
# 4. KALIBRASI ADAPTIF (Alokasi Sampel Tambahan M)
# ====================================================================
print("\n4. KALIBRASI ADAPTIF (Alokasi Sampel M)")

# Total Budget Sampel Tambahan (M)
M_budget = int(0.05 * POP_SIZE) # 5% dari total populasi
print(f"  Total Budget Sampel Tambahan (M): {M_budget}")

additional_allocation = {}
print("  Alokasi M berdasarkan Loss (M ~ Loss):")

for region, loss in strata_loss.items():
    # Hitung proporsi M berdasarkan Loss
    proporsi = loss / total_loss
    M_region = round(proporsi * M_budget)
    additional_allocation[region] = M_region

    # Filosofi: Semakin tinggi loss (varians), semakin besar alokasi M
    print(f"    Region {region}: Loss={loss:.4f}, Alokasi M: {M_region}")

# ====================================================================
# 5. ADAPTASI DAN PELATIHAN ULANG TABNET
# ====================================================================
print("\n5. ADAPTASI DAN PELATIHAN ULANG TABNET")

# Mengambil sampel tambahan (M)
additional_indices_list = []
for region, M_size in additional_allocation.items():

    # Filter data sisa yang termasuk dalam Region ini
    region_filter = df_rem['Region'] == region
    available_indices = df_rem[region_filter].index.values

    # Ambil M sampel dari indeks yang tersedia
    if len(available_indices) >= M_size:
        M_indices = np.random.choice(available_indices, size=M_size, replace=False)
        additional_indices_list.extend(M_indices)
    else:
        # Jika data tidak cukup, ambil semua yang tersisa
        additional_indices_list.extend(available_indices)

print(f"  Total Sampel Tambahan (M) yang diambil: {len(additional_indices_list)}")

# Gabungkan data N_initial dan M_additional
final_indices = np.concatenate([initial_indices, np.array(additional_indices_list)])
X_final, Y_final = X[final_indices], Y[final_indices]

# Pelatihan ulang pada dataset final yang dioptimalkan
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(X_final, Y_final, test_size=0.3, random_state=42)

model_final = TabNetRegressor()
model_final.fit(
    X_train_final, y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    patience=50,
    max_epochs=100
)

# ====================================================================
# 6. PERBANDINGAN KINERJA AKHIR
# ====================================================================
print("\n==============================================")
print("6. PERBANDINGAN KINERJA AKHIR")
print("==============================================")
print(f"Ukuran Dataset Awal (N): {N_initial_size}")
print(f"Ukuran Dataset Final (N+M): {len(final_indices)}")
print("-" * 45)
print(f"Loss VALIDASI Baseline (N): {model_baseline.history['val_0_mse'][-1]:.4f}")
print(f"Loss VALIDASI Final (N+M Terkalibrasi): {model_final.history['val_0_mse'][-1]:.4f}")
print("\nFilosofi ASS + TabNet: Dengan mengalokasikan sampel tambahan ke strata yang sulit")
print("(loss/varians tinggi), kita berharap model final menunjukkan penurunan loss yang signifikan")
print("dibandingkan jika kita hanya mengambil N+M secara acak.")
print("==============================================")

1. SIMULASI DATA DAN STRATIFIKASI AWAL

2. PENGAMBILAN SAMPEL AWAL (N)




epoch 0  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 1  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 2  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 3  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 4  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 5  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 6  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 7  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 8  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 9  | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 10 | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 11 | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 12 | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 13 | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 14 | loss: 0.0     | val_0_mse: 281984077.38248|  0:00:00s
epoch 15 | loss: 0.0     



  Rata-rata Loss Model Baseline di Sisa Populasi (per Strata):
Region
0    7.225401e+08
1    3.838277e+03
2    1.693780e+03

4. KALIBRASI ADAPTIF (Alokasi Sampel M)
  Total Budget Sampel Tambahan (M): 250
  Alokasi M berdasarkan Loss (M ~ Loss):
    Region 0: Loss=722540109.9937, Alokasi M: 250
    Region 1: Loss=3838.2772, Alokasi M: 0
    Region 2: Loss=1693.7799, Alokasi M: 0

5. ADAPTASI DAN PELATIHAN ULANG TABNET
  Total Sampel Tambahan (M) yang diambil: 250
epoch 0  | loss: 0.0     | val_0_mse: 631450460.35314|  0:00:00s
epoch 1  | loss: 0.0     | val_0_mse: 631450460.35314|  0:00:00s
epoch 2  | loss: 0.0     | val_0_mse: 631450460.35314|  0:00:00s
epoch 3  | loss: 0.0     | val_0_mse: 631450460.35314|  0:00:00s
epoch 4  | loss: 0.0     | val_0_mse: 631450460.35314|  0:00:00s
epoch 5  | loss: 0.0     | val_0_mse: 631450460.35314|  0:00:00s
epoch 6  | loss: 0.0     | val_0_mse: 631450460.35314|  0:00:00s
epoch 7  | loss: 0.0     | val_0_mse: 631450460.35314|  0:00:00s
epoch 8  | l

