# Sistem Deteksi Wajah dan Suku/Etnis

Notebook ini menunjukkan:
1. **Deteksi Wajah**: Menggunakan MTCNN untuk mendeteksi wajah.
2. **Face Similarity**: Menggunakan model Siamese (ResNet50) untuk menghitung *Euclidean distance* antar wajah.
3. **Deteksi Suku/Etnis**: Menggunakan model MobileNetV2 untuk memprediksi suku/etnis.
4. **Evaluasi**: Metrik untuk *face similarity* (AUC, EER, precision, recall, F1, TAR, FAR, FRR) dan deteksi suku/etnis (classification report, confusion matrix).
5. **Visualisasi**: ROC curve, distribusi jarak, t-SNE, confusion matrix, dan contoh inferensi.

**Prasyarat**:
- Dataset: `training.csv`, `validation.csv`, `testing.csv` (menggunakan pemisah titik koma `;`) dengan kolom `path`, `nama`, `suku`, `ekspresi`, `sudut`, `jarak`, `pencahayaan`.
- Model: `feature_extractor_siamese.h5` (face similarity), `ethnicity_model.h5` (deteksi suku/etnis), `label_encoder.pkl` (LabelEncoder).
- Gambar: Disimpan di `training/[suku]/[nama]/[nama]_varianX.jpg`, `validation/[suku]/[nama]/[nama]_varianX.jpg`, `testing/[suku]/[nama]/[nama]_varianX.jpg`.
- Dependensi: Lihat `requirements.txt`.

**Struktur Direktori**:
```
project/
├── training.csv
├── validation.csv
├── testing.csv
├── feature_extractor_siamese.h5
├── ethnicity_model.h5
├── label_encoder.pkl
├── training/
│   ├── Jawa/
│   │   ├── John/
│   │   │   ├── john_varian1.jpg
│   │   │   ├── ...
│   ├── Sunda/
│   │   ├── ...
├── validation/
│   ├── Jawa/
│   │   ├── ...
│   ├── Sunda/
│   │   ├── ...
├── testing/
│   ├── Jawa/
│   │   ├── ...
│   ├── Sunda/
│   │   ├── ...
├── face_similarity_ethnicity_detection.ipynb
```

**Catatan**:
- Semua file (model, CSV, notebook) berada di direktori utama.
- Output visualisasi (misalnya, `roc_curve.png`) akan disimpan di direktori utama.

In [None]:
import pandas as pd
import numpy as np
import cv2
import tensorflow as tf
from mtcnn import MTCNN
from tensorflow.keras.models import load_model
from PIL import Image
import os
import pickle
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Configuration
# IMG_SIZE is the size handed to cv2.resize when images are loaded for the
# face-similarity model.
IMG_SIZE = (255, 255)
MODEL_PATH_SIAMESE = 'feature_extractor_siamese.h5'
MODEL_PATH_ETHNICITY = 'ethnicity_model.h5'
LABEL_ENCODER_PATH = 'label_encoder.pkl'

# Initialise the MTCNN face detector (shared by the helper functions below)
detector = MTCNN()

# Load the Siamese feature-extractor model; any failure is printed and re-raised
try:
    feature_model = load_model(MODEL_PATH_SIAMESE)
    # NOTE(review): the code visible in this notebook only calls predict() on
    # this model, so this compile step looks unnecessary -- confirm before removing.
    feature_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
    print("Model Siamese berhasil dimuat.")
except Exception as e:
    print(f"Error memuat model Siamese: {e}")
    raise

# Load the ethnicity classifier and its LabelEncoder
try:
    ethnicity_model = load_model(MODEL_PATH_ETHNICITY)
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load a label_encoder.pkl that comes from a trusted source.
    with open(LABEL_ENCODER_PATH, 'rb') as f:
        le = pickle.load(f)
    print("Model deteksi suku/etnis dan LabelEncoder berhasil dimuat.")
except Exception as e:
    print(f"Error memuat model deteksi suku/etnis: {e}")
    raise

In [None]:
# Load and preprocess an image for the face-similarity model
def load_image(image_path):
    """Read an image from disk and prepare it for the face-similarity model.

    The file is read with OpenCV, converted from BGR to RGB, resized to
    ``IMG_SIZE``, cast to ``float32`` and divided by 255.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The preprocessed image array, or ``None`` when the file cannot be
        read or any processing step fails (the error is printed).
    """
    try:
        bgr = cv2.imread(image_path)
        # cv2.imread signals an unreadable file by returning None instead of raising.
        if bgr is None:
            raise ValueError(f"Gambar tidak ditemukan: {image_path}")
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        resized = cv2.resize(rgb, IMG_SIZE)
        return resized.astype('float32') / 255.0
    except Exception as e:
        print(f"Error memuat gambar {image_path}: {e}")
        return None

# Preprocess an image with face detection (ethnicity detection)
def preprocess_image(img_rgb, target_size=(255, 255)):
    """Crop the first detected face and prepare it for the ethnicity model.

    The crop is the part of the detector's bounding box that lies inside the
    image. It is resized to ``target_size``, cast to ``float32`` and divided
    by 255.

    Args:
        img_rgb: RGB image array passed to ``detector.detect_faces``.
        target_size: Size handed to ``cv2.resize`` for the face crop.

    Returns:
        The preprocessed face array, or ``None`` when no face is detected,
        the crop is empty, or any step fails (the error is printed).
    """
    try:
        faces = detector.detect_faces(img_rgb)
        if not faces:
            raise ValueError("Wajah tidak terdeteksi.")

        x, y, w, h = faces[0]['box']
        # Compute the far corner from the raw box *before* clamping the
        # origin; otherwise a box that starts off-image (negative x/y) would
        # be shifted and the crop would extend past the detected face.
        x_end, y_end = max(x + w, 0), max(y + h, 0)
        x, y = max(x, 0), max(y, 0)
        face = img_rgb[y:y_end, x:x_end]
        if face.size == 0:
            raise ValueError("Bounding box wajah kosong.")

        return cv2.resize(face, target_size).astype('float32') / 255.0
    except Exception as e:
        print(f"Error preprocess gambar: {e}")
        return None

# Embedding extraction (face similarity)
def extract_embedding(face, model):
    """Return the model output for a single face.

    Args:
        face: Array for one face; a leading batch axis of size 1 is added
            before it is passed to the model.
        model: Object exposing ``predict(batch)``.

    Returns:
        The first (and only) row of ``model.predict`` for the one-item batch.
    """
    batch = np.asarray(face)[np.newaxis, ...]
    return model.predict(batch)[0]

# Euclidean distance (face similarity)
def euclidean_distance_inference(emb1, emb2):
    """Return the Euclidean (L2) distance between two embedding arrays."""
    delta = emb1 - emb2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

# Convert a distance into a similarity score (face similarity)
def distance_to_similarity(distance, max_distance=5.0):
    """Map a distance linearly onto a similarity score.

    Args:
        distance: Distance between two embeddings.
        max_distance: Distance at (and beyond) which the similarity is 0.

    Returns:
        ``1 - distance / max_distance`` when that value is positive,
        otherwise ``0``.
    """
    score = 1 - (distance / max_distance)
    if score > 0:
        return score
    return 0

# Visualise detected faces with bounding boxes
def draw_bbox(image_path):
    """Return the RGB image at ``image_path`` with detected faces outlined.

    Every box reported by ``detector.detect_faces`` is drawn as a green
    rectangle, 2 pixels thick.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The annotated RGB image, or ``None`` when reading, detection or
        drawing fails (the error is printed).
    """
    try:
        canvas = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
        detections = detector.detect_faces(canvas)
        for detection in (detections or []):
            left, top, width, height = detection['box']
            top_left = (left, top)
            bottom_right = (left + width, top + height)
            cv2.rectangle(canvas, top_left, bottom_right, (0, 255, 0), 2)
        return canvas
    except Exception as e:
        print(f"Error visualisasi {image_path}: {e}")
        return None

In [None]:
# Dataset validation
def validate_dataset(csv_file):
    """Load a semicolon-delimited dataset CSV and report basic sanity checks.

    Checks performed:
      * all expected columns are present (a missing one raises);
      * every entry in ``path`` exists on disk (missing ones are printed);
      * every ``nama`` has at least two rows (others are printed as a warning).

    A one-line summary (images, individuals, ethnic groups) is printed.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If an expected column is missing.
        Exception: Any other error is printed and re-raised unchanged.
    """
    required = ('path', 'nama', 'suku', 'ekspresi', 'sudut', 'jarak', 'pencahayaan')
    try:
        df = pd.read_csv(csv_file, sep=';')

        # Report the first missing column, in the order listed above.
        missing = next((col for col in required if col not in df.columns), None)
        if missing is not None:
            raise ValueError(f"Kolom {missing} tidak ditemukan di {csv_file}")

        invalid_paths = [path for path in df['path'] if not os.path.exists(path)]
        if invalid_paths:
            print(f"Path tidak valid di {csv_file}: {invalid_paths}")

        counts = df['nama'].value_counts()
        rare = counts[counts < 2]
        if not rare.empty:
            print(f"Peringatan: Individu dengan <2 gambar di {csv_file}: {rare}")

        n_suku = df['suku'].unique().size
        print(f"{csv_file}: {len(df)} gambar, {len(counts)} individu, {n_suku} suku")
        return df
    except Exception as e:
        print(f"Error validasi {csv_file}: {e}")
        raise

# Validate every CSV split and keep the loaded DataFrames keyed by file name
CSV_FILES = ['training.csv', 'validation.csv', 'testing.csv']
dfs = {}
for csv_file in CSV_FILES:
    dfs[csv_file] = validate_dataset(csv_file)

In [None]:
# Face-similarity evaluation
def evaluate_face_similarity(df, model):
    """Evaluate the embedding model on every pair of images in ``df``.

    Each readable image is embedded once. Every unordered pair is then scored
    by Euclidean distance and labelled 1 when both rows share the same
    ``nama``, else 0. The function plots and saves the ROC curve, the distance
    distributions and the confusion matrix, and prints AUC, EER, TAR, FAR,
    FRR, precision, recall and F1.

    Args:
        df: DataFrame with at least the ``path`` and ``nama`` columns.
        model: Model passed to ``extract_embedding``.

    Returns:
        Tuple ``(far_threshold, pairs_info)``. ``far_threshold`` is a
        *distance* threshold: pairs whose distance is at or below it are
        predicted as the same identity. It is the loosest ROC operating point
        whose false-accept rate is below 0.05. ``pairs_info`` is a list of
        dicts with keys ``image1``, ``image2``, ``distance`` and ``label``.

    Raises:
        ValueError: If the valid pairs do not contain both same-identity and
            different-identity examples (the metrics are undefined).
    """
    # Embed every image exactly once instead of once per pair. Iterating the
    # columns with zip also avoids label-based indexing (df['path'][i]), which
    # breaks when the DataFrame index is not 0..n-1.
    embedded = []
    for img_path, name in zip(df['path'], df['nama']):
        face = load_image(img_path)
        if face is None:
            continue
        embedded.append((img_path, name, extract_embedding(face, model)))

    distances = []
    labels = []
    pairs_info = []
    for idx, (path1, name1, emb1) in enumerate(embedded):
        for path2, name2, emb2 in embedded[idx + 1:]:
            dist = euclidean_distance_inference(emb1, emb2)
            label = 1 if name1 == name2 else 0
            distances.append(dist)
            labels.append(label)
            pairs_info.append({
                'image1': path1,
                'image2': path2,
                'distance': dist,
                'label': label
            })

    if len(set(labels)) < 2:
        raise ValueError(
            "Evaluasi face similarity membutuhkan pasangan identitas sama dan berbeda."
        )

    dist_arr = np.asarray(distances, dtype=float)
    label_arr = np.asarray(labels)

    # roc_curve expects "higher score = more likely positive", so distances
    # are negated. The thresholds it returns are therefore in score space and
    # must be negated again to obtain a distance threshold.
    fpr, tpr, thresholds = roc_curve(label_arr, -dist_arr)
    auc_score = auc(fpr, tpr)
    low_far_idx = np.where(fpr < 0.05)[0]
    score_threshold = thresholds[low_far_idx[-1]] if low_far_idx.size else thresholds[0]
    far_threshold = float(-score_threshold)

    # Equal error rate: the ROC point where FPR and FNR (1 - TPR) are closest
    eer_idx = np.argmin(np.abs(fpr - (1 - tpr)))
    eer = fpr[eer_idx]

    # ROC curve
    plt.figure()
    plt.plot(fpr, tpr, label=f'ROC (AUC = {auc_score:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.scatter(fpr[eer_idx], tpr[eer_idx], marker='o', color='red', label=f'EER = {eer:.4f}')
    plt.title('ROC Curve Face Similarity')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.savefig('roc_curve.png')
    plt.show()
    plt.close()

    # Distance distributions
    plt.figure()
    sns.histplot(dist_arr[label_arr == 1], label='Same Identity', color='blue', alpha=0.5)
    sns.histplot(dist_arr[label_arr == 0], label='Different Identity', color='red', alpha=0.5)
    plt.legend()
    plt.title('Distribusi Jarak Euclidean (Same vs Different Identity)')
    plt.savefig('distance_distribution.png')
    plt.show()
    plt.close()

    # Metrics at the chosen threshold. "<=" mirrors roc_curve's
    # "score >= threshold" rule, so the FAR here matches the selected ROC point.
    predictions = (dist_arr <= far_threshold).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_arr, predictions, average='binary'
    )
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted.
    cm = confusion_matrix(label_arr, predictions, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    tar = tp / (tp + fn) if (tp + fn) > 0 else 0
    far = fp / (fp + tn) if (fp + tn) > 0 else 0
    frr = fn / (fn + tp) if (fn + tp) > 0 else 0

    # Confusion matrix
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix Face Similarity')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig('confusion_matrix_facesimilarity.png')
    plt.show()
    plt.close()

    print(f"Threshold dengan FAR < 0.05: {far_threshold:.4f}, AUC: {auc_score:.4f}")
    print(f"EER: {eer:.4f}")
    print(f"TAR: {tar:.4f}, FAR: {far:.4f}, FRR: {frr:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    return far_threshold, pairs_info

# Run the face-similarity evaluation on the testing split
far_threshold, pairs_info = evaluate_face_similarity(dfs['testing.csv'], feature_model)

In [None]:
# t-SNE visualisation for face similarity
def visualize_tsne(df, model):
    """Project image embeddings to 2-D with t-SNE and plot them.

    Images that fail to load are skipped. Points are coloured by identity
    (``nama``). The figure is saved as ``tsne_embedding.png`` and shown.

    Args:
        df: DataFrame with ``path`` and ``nama`` columns.
        model: Model passed to ``extract_embedding``.

    Returns:
        ``None``. When fewer than two embeddings are available a message is
        printed and nothing is plotted.
    """
    vectors, identities = [], []
    for path, identity in zip(df['path'], df['nama']):
        face = load_image(path)
        if face is not None:
            vectors.append(extract_embedding(face, model))
            identities.append(identity)

    count = len(vectors)
    if count < 2:
        print("Tidak cukup embedding untuk t-SNE.")
        return

    # Perplexity is capped at count - 1 so it stays below the number of samples.
    reducer = TSNE(n_components=2, random_state=42, perplexity=min(5, count - 1))
    points = reducer.fit_transform(np.array(vectors))

    plt.figure()
    identity_codes = pd.factorize(identities)[0]
    plt.scatter(points[:, 0], points[:, 1], c=identity_codes)
    plt.title('t-SNE Embedding Face Similarity')
    plt.savefig('tsne_embedding.png')
    plt.show()
    plt.close()

# Run the t-SNE visualisation on the testing split
visualize_tsne(dfs['testing.csv'], feature_model)

In [None]:
# Ethnicity-detection evaluation
def evaluate_ethnicity(df, model, label_encoder):
    """Evaluate the ethnicity classifier on the images listed in ``df``.

    Each image is read, its first detected face is cropped by
    ``preprocess_image``, and all crops are classified in one batch. A
    classification report is printed and the confusion matrix is plotted and
    saved as ``confusion_matrix_ethnicity.png``. Images that cannot be read
    or contain no detectable face are skipped.

    Args:
        df: DataFrame with ``path`` and ``suku`` columns.
        model: Classifier exposing ``predict``; the argmax of its output is
            interpreted as an index into ``label_encoder.classes_``.
        label_encoder: Fitted encoder exposing ``transform`` and ``classes_``.

    Returns:
        ``None``. When no image yields a usable face a message is printed
        and nothing is evaluated.
    """
    images = []
    true_labels = []

    for img_path, suku in zip(df['path'], df['suku']):
        img = cv2.imread(img_path)
        if img is None:
            continue
        face = preprocess_image(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        if face is None:
            continue
        images.append(face)
        true_labels.append(suku)

    if not images:
        print("Tidak ada gambar valid untuk evaluasi deteksi suku/etnis.")
        return

    images = np.array(images)
    true_labels_enc = label_encoder.transform(true_labels)
    pred_classes = np.argmax(model.predict(images), axis=1)

    # Pass the full list of class ids explicitly. Without it, scikit-learn
    # infers the classes from the data, and target_names / tick labels no
    # longer line up (or raise) when a class is absent from this split.
    class_ids = np.arange(len(label_encoder.classes_))
    class_names = [str(name) for name in label_encoder.classes_]

    print("Evaluasi Deteksi Suku/Etnis (Testing):")
    print(classification_report(
        true_labels_enc, pred_classes, labels=class_ids, target_names=class_names
    ))

    cm = confusion_matrix(true_labels_enc, pred_classes, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix Deteksi Suku/Etnis')
    plt.savefig('confusion_matrix_ethnicity.png')
    plt.show()
    plt.close()

# Run the ethnicity-detection evaluation on the testing split
evaluate_ethnicity(dfs['testing.csv'], ethnicity_model, le)

In [None]:
# Inference example
def _predict_ethnicity(img_path, ethnicity_model, label_encoder):
    """Return ``(label, scores_by_class)`` for one image, or ``(None, None)``.

    ``(None, None)`` is returned when the image cannot be read or
    ``preprocess_image`` yields no face.
    """
    img = cv2.imread(img_path)
    if img is None:
        return None, None
    face = preprocess_image(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    if face is None:
        return None, None
    scores = ethnicity_model.predict(np.expand_dims(face, axis=0))[0]
    label = label_encoder.inverse_transform([np.argmax(scores)])[0]
    return label, dict(zip(label_encoder.classes_, scores))


def run_inference(img_path1, img_path2, feature_model, ethnicity_model, label_encoder, threshold):
    """Compare two face images and predict the ethnicity shown in each.

    Prints the similarity score, the Euclidean distance, the match status
    and, for each image with a detected face, the predicted label with the
    per-class model outputs. A side-by-side figure with bounding boxes is
    saved as ``inference_example.png`` and shown.

    Args:
        img_path1: Path of the first image.
        img_path2: Path of the second image.
        feature_model: Embedding model passed to ``extract_embedding``.
        ethnicity_model: Classifier exposing ``predict``.
        label_encoder: Fitted encoder exposing ``inverse_transform`` and
            ``classes_``.
        threshold: Distance below which the two faces count as a match.

    Returns:
        ``None``. When either image cannot be loaded a message is printed
        and nothing else happens.
    """
    image_paths = (img_path1, img_path2)

    # Face similarity
    face1 = load_image(img_path1)
    face2 = load_image(img_path2)
    if face1 is None or face2 is None:
        print("Gagal memuat salah satu gambar.")
        return
    emb1 = extract_embedding(face1, feature_model)
    emb2 = extract_embedding(face2, feature_model)
    dist = euclidean_distance_inference(emb1, emb2)
    similarity_score = distance_to_similarity(dist)
    is_match = dist < threshold

    # Ethnicity detection, one (label, scores) pair per image
    results = [
        _predict_ethnicity(path, ethnicity_model, label_encoder)
        for path in image_paths
    ]

    # Visualisation
    plt.figure(figsize=(12, 5))
    for panel, (path, (label, _)) in enumerate(zip(image_paths, results), start=1):
        plt.subplot(1, 2, panel)
        annotated = draw_bbox(path)
        if annotated is not None:
            plt.imshow(annotated)
            # "is not None" rather than truthiness, so a falsy but valid
            # label (e.g. 0 or '') is still shown as a prediction.
            shown = label if label is not None else 'Tidak Terdeteksi'
            plt.title(f"Suku: {shown}")
            plt.axis('off')
    plt.tight_layout()
    plt.savefig('inference_example.png')
    plt.show()
    plt.close()

    print(f"Skor Kemiripan: {similarity_score:.4f} (Jarak Euclidean: {dist:.4f})")
    print(f"Status: {'Match' if is_match else 'Tidak Match'}")
    for number, (label, scores_by_class) in enumerate(results, start=1):
        if label is None:
            continue
        print(f"Suku Gambar {number}: {label}")
        print("Probabilitas per Suku:")
        for suku, prob in scores_by_class.items():
            print(f"{suku}: {prob:.2%}")

# Run inference on the first two images listed in the testing split
img_path1 = dfs['testing.csv']['path'].iloc[0]
img_path2 = dfs['testing.csv']['path'].iloc[1]
run_inference(img_path1, img_path2, feature_model, ethnicity_model, le, far_threshold)