<a href="https://colab.research.google.com/github/dimasnurmiraj/221230053-Pengantar-ML/blob/main/week_2/latihan_praktikum_2_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# ==================================================
# Dataset simulasi: 100 samples, 5 features
# ==================================================
np.random.seed(42)
# Draw standard-normal samples (100 rows x 5 features), then scale by 10 and
# shift by 5 so every feature is distributed with mean ~5 and std ~10.
X = 5 + 10 * np.random.standard_normal(size=(100, 5))

# ==================================================
# TODO 1: Z-score Normalization
# ==================================================
def z_score_normalization(data):
    """
    Z-score normalization: (x - mean) / std, computed per column (axis 0).

    Input: data (numpy array or array-like; statistics are taken along axis 0)
    Output: float array of the same shape whose columns have mean=0, std=1.
            Columns that contain a single repeated value have no spread to
            scale by and are returned as all zeros.
    Raises: ValueError if data is empty.
    """
    data = np.asarray(data)
    if data.size == 0:
        raise ValueError("❌ Input data kosong")

    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)

    # A constant column is detected by min == max, not only by std == 0:
    # rounding in the mean (e.g. a column of 0.1) can leave a std of about
    # one ulp, and dividing by it would turn the column into +/-1 instead of 0.
    constant = (np.min(data, axis=0) == np.max(data, axis=0)) | (std == 0)

    # Replace the divisor for constant columns so the division below is safe.
    std_corrected = np.where(constant, 1, std)

    normalized = np.where(constant, 0.0, (data - mean) / std_corrected)
    return normalized

# Standardize every feature column of the simulated dataset.
X_normalized = z_score_normalization(X)

# ==================================================
# TODO 2: Handle Outliers
# ==================================================
def handle_outliers(data, std_threshold=3):
    """
    Clip values that lie more than std_threshold standard deviations away
    from their column mean (statistics are taken along axis 0).

    Input: data (numpy array or array-like), std_threshold (default=3,
           must not be negative)
    Output: array of the same shape with outliers clipped to
            mean ± std_threshold * std of their column
    Raises: ValueError if data is empty or std_threshold is negative.
    """
    data = np.asarray(data)
    if data.size == 0:
        raise ValueError("❌ Input data kosong")
    # A negative threshold would put the lower bound above the upper bound,
    # and np.clip would then silently overwrite every value.
    if np.any(np.asarray(std_threshold) < 0):
        raise ValueError("❌ std_threshold tidak boleh negatif")

    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)

    lower_bound = mean - std_threshold * std
    upper_bound = mean + std_threshold * std

    clipped = np.clip(data, lower_bound, upper_bound)
    return clipped

# Clip the normalized features using the default threshold of 3 standard deviations.
X_cleaned = handle_outliers(X_normalized)

# ==================================================
# TODO 3: One-hot Encoding
# ==================================================
def one_hot_encoding(labels):
    """
    Convert categorical integer labels into a one-hot encoded array.

    The number of classes is inferred as max(labels) + 1, so class ids are
    expected to be non-negative integers starting at 0.

    Input: labels (1D numpy array or array-like of non-negative integers)
    Output: 2D float numpy array of shape (len(labels), n_classes)
    Raises: ValueError if labels is empty or contains a negative value,
            TypeError if labels is not of an integer dtype.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        raise ValueError("❌ Input labels kosong")
    if not np.issubdtype(labels.dtype, np.integer):
        raise TypeError("❌ Labels harus bertipe integer")
    # Negative ids would index the identity matrix from the end and silently
    # produce the encoding of a different class.
    if np.min(labels) < 0:
        raise ValueError("❌ Labels tidak boleh negatif")

    # Convert to a Python int before adding 1 so a narrow dtype such as
    # uint8 cannot overflow when the largest label is the dtype maximum.
    n_classes = int(np.max(labels)) + 1

    # Row i of the identity matrix is the one-hot vector for class i.
    one_hot = np.eye(n_classes)[labels]
    return one_hot

# Seven sample labels covering the three classes 0, 1 and 2.
labels = np.array([0, 1, 2, 0, 1, 2, 0])
one_hot_labels = one_hot_encoding(labels)

# ==================================================
# TODO 4: Train-test Split Manual
# ==================================================
def train_test_split_numpy(X, y, test_size=0.2, random_seed=42):
    """
    Manually split a dataset into shuffled train and test subsets.

    Input:
        X (features), y (labels): arrays or array-likes with the same
            number of samples along axis 0
        test_size (float, strictly between 0 and 1): fraction of samples
            placed in the test set; the count is truncated towards zero,
            so very small datasets can end up with an empty test set
        random_seed (int, optional): seed for the shuffle; the same seed
            always yields the same split
    Output: X_train, X_test, y_train, y_test
    Raises: ValueError if X and y differ in sample count or test_size is
            outside (0, 1).
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.shape[0] != y.shape[0]:
        raise ValueError("❌ Jumlah samples X dan y harus sama")
    if not (0 < test_size < 1):
        raise ValueError("❌ test_size harus antara 0 dan 1")

    # Use a private generator instead of np.random.seed(): it produces the
    # same permutation for a given seed but leaves the global NumPy RNG
    # state of the caller untouched.
    rng = np.random.RandomState(random_seed)
    indices = rng.permutation(X.shape[0])
    test_count = int(X.shape[0] * test_size)

    # The first test_count shuffled indices form the test set, the rest train.
    test_idx = indices[:test_count]
    train_idx = indices[test_count:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# 100 random integer class labels in {0, 1, 2}, drawn from the global NumPy
# RNG (so the values follow from the seed set before X was generated).
y = np.random.randint(0, 3, 100)
# Split with the function defaults: test_size=0.2, random_seed=42.
X_train, X_test, y_train, y_test = train_test_split_numpy(X, y)

# ==================================================
# ✅ Validasi hasil
# ==================================================
# Normalization must keep the shape, and because the z-score is computed per
# feature column, the mean/std are checked per column (axis=0) rather than
# over the whole matrix, where deviations in single columns could cancel out.
assert X_normalized.shape == X.shape, "Shape harus sama setelah normalisasi"
assert np.allclose(X_normalized.mean(axis=0), 0, atol=1e-10), "Mean harus ~0 setelah z-score"
assert np.allclose(X_normalized.std(axis=0), 1, atol=1e-10), "Std harus ~1 setelah z-score"

# Report a short preview of every intermediate result.
print("✅ Semua operasi NumPy berhasil!")
print("\n📌 X_normalized (5 sampel pertama):\n", X_normalized[:5])
print("\n📌 X_cleaned (5 sampel pertama):\n", X_cleaned[:5])
print("\n📌 One-hot labels:\n", one_hot_labels)
print("\n📌 Train shape:", X_train.shape, "| Test shape:", X_test.shape)


✅ Semua operasi NumPy berhasil!

📌 X_normalized (5 sampel pertama):
 [[ 0.604418   -0.21979528  0.75746006  1.461092   -0.18919425]
 [-0.21141045  1.53420502  0.87977344 -0.62585547  0.53845484]
 [-0.46735006 -0.55422449  0.34303561 -2.13809665 -1.58578836]
 [-0.57771567 -1.1129603   0.41687036 -1.08519209 -1.2929218 ]
 [ 1.68601234 -0.30916828  0.16486193 -1.62640854 -0.47982667]]

📌 X_cleaned (5 sampel pertama):
 [[ 0.604418   -0.21979528  0.75746006  1.461092   -0.18919425]
 [-0.21141045  1.53420502  0.87977344 -0.62585547  0.53845484]
 [-0.46735006 -0.55422449  0.34303561 -2.13809665 -1.58578836]
 [-0.57771567 -1.1129603   0.41687036 -1.08519209 -1.2929218 ]
 [ 1.68601234 -0.30916828  0.16486193 -1.62640854 -0.47982667]]

📌 One-hot labels:
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]

📌 Train shape: (80, 5) | Test shape: (20, 5)
