# 🧠 EDA dan Preprocessing: Dataset Kartu Pokémon Asli vs. Palsu

Notebook ini bertujuan untuk melakukan Analisis Data Eksplorasi (EDA) dan Preprocessing pada dataset gambar kartu Pokémon untuk memahami karakteristik yang membedakan kartu asli (real) dan palsu (fake) sebelum tahap modeling.

📎 Link Dataset: https://www.kaggle.com/datasets/ongshujian/real-and-fake-pokemon-cards/data

## 1. Data Acquisition & Library

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Register tqdm with pandas so that `progress_apply` becomes available.
tqdm.pandas()
print('✅ Library siap digunakan!')

## 2. Memuat Data dan Label

In [None]:
# Location of the training images and of the CSV that labels them.
train_img_path = 'data/train/'
train_labels_path = 'data/train_labels.csv'

labels_df = pd.read_csv(train_labels_path)
# Ids are handled as strings so they can be turned into file names.
labels_df['id'] = labels_df['id'].astype(str)
labels_df['filepath'] = [
    os.path.join(train_img_path, f'{image_id}.jpg')
    for image_id in labels_df['id']
]
# Replace the numeric class codes with readable names (0 -> real, 1 -> fake).
labels_df['label'] = labels_df['label'].map({0: 'real', 1: 'fake'})

# Keep only the columns used downstream and shuffle the rows reproducibly.
df = (
    labels_df[['filepath', 'label']]
    .copy()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print('DataFrame berhasil dimuat.')
df.head()

## 3. EDA: Distribusi Label dan Contoh Gambar

In [None]:
# Bar chart of how many cards fall into each label.
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='label', palette='pastel')
plt.gca().set_title('Distribusi Label Kartu Pokémon')
plt.show()

def tampilkan_contoh(label_name, jumlah=5):
    """Show up to ``jumlah`` randomly sampled card images of one class in a row.

    Reads the notebook-level ``df`` (columns ``filepath`` and ``label``).

    Args:
        label_name: Value of the ``label`` column to sample from.
        jumlah: Maximum number of images to display. Fewer are shown when
            the class contains fewer rows.
    """
    candidates = df.loc[df['label'] == label_name, 'filepath']
    # Series.sample raises when asked for more rows than exist, so cap the
    # request at the number of available images.
    n_show = min(jumlah, len(candidates))
    if n_show == 0:
        print(f"Tidak ada gambar dengan label '{label_name}'.")
        return

    sample_paths = candidates.sample(n_show, random_state=1).values
    plt.figure(figsize=(18, 5))
    for i, fp in enumerate(sample_paths):
        plt.subplot(1, n_show, i + 1)
        # Close each file handle as soon as matplotlib has taken the pixels.
        with Image.open(fp) as img:
            plt.imshow(img)
        plt.axis('off')
    plt.suptitle(f'Contoh Kartu {label_name.upper()}')
    plt.show()

# Display a row of sample images for each class (default: 5 per class).
tampilkan_contoh('real')
tampilkan_contoh('fake')

## 4. Feature Extraction: Color Histogram

In [None]:
def extract_color_histogram(image_path, bins=(8, 8, 8)):
    """Compute a flattened, normalised 3-D colour histogram of an image.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        bins: Number of histogram bins for each of the three colour channels.

    Returns:
        A 1-D array with one entry per bin combination, or ``None`` when
        OpenCV could not read the file.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        return None

    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    channels = [0, 1, 2]
    # Every channel covers the full 8-bit range [0, 256).
    value_ranges = [0, 256] * 3
    hist = cv2.calcHist([rgb], channels, None, bins, value_ranges)
    # Normalise in place with cv2.normalize's default settings.
    cv2.normalize(hist, hist)
    return hist.flatten()

# Compute the colour histogram of every image, with a tqdm progress bar.
df['hist_features'] = df['filepath'].progress_apply(extract_color_histogram)
# Unreadable images yield None; keep only rows with features and renumber.
df = df[df['hist_features'].notna()].reset_index(drop=True)
print('Ekstraksi fitur Color Histogram selesai.')

### Visualisasi Histogram Warna Kartu Asli vs. Palsu

In [None]:
def show_histogram_comparison(real_path, fake_path):
    """Plot two cards next to their per-channel colour histograms.

    The figure has one row per card (REAL on top, FAKE below); each row shows
    the image on the left and its R, G and B intensity histograms on the right.

    Args:
        real_path: Path of the image shown in the REAL row.
        fake_path: Path of the image shown in the FAKE row.

    Raises:
        ValueError: If OpenCV cannot read one of the images.
    """
    # Load both images before opening a figure, so a bad path fails with a
    # clear message instead of an opaque cv2 error and a half-drawn figure.
    images = []
    for path in (real_path, fake_path):
        img = cv2.imread(path)
        if img is None:
            raise ValueError(f'Gambar tidak dapat dibaca: {path}')
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    plt.figure(figsize=(14, 6))
    # 2x2 grid: only two panels are drawn per row, so two columns suffice.
    for row, (img_rgb, title) in enumerate(zip(images, ('REAL', 'FAKE'))):
        plt.subplot(2, 2, 2 * row + 1)
        plt.imshow(img_rgb)
        plt.title(f'Gambar {title}')
        plt.axis('off')

        plt.subplot(2, 2, 2 * row + 2)
        for channel, col in enumerate(('r', 'g', 'b')):
            hist = cv2.calcHist([img_rgb], [channel], None, [256], [0, 256])
            plt.plot(hist, color=col)
        plt.title(f'Histogram {title}')
        plt.xlim([0, 256])
    plt.tight_layout()
    plt.show()

# Use the first real and the first fake card in the DataFrame as examples.
real_path = df.loc[df['label'] == 'real', 'filepath'].iloc[0]
fake_path = df.loc[df['label'] == 'fake', 'filepath'].iloc[0]
show_histogram_comparison(real_path, fake_path)

## 5. Feature Extraction Tambahan: Edge dan Corner Detection

In [None]:
def extract_edge_corner_features(
    image_path,
    canny_low=100,
    canny_high=200,
    harris_block_size=2,
    harris_ksize=3,
    harris_k=0.04,
    corner_ratio=0.01,
):
    """Measure how much of an image consists of edge and corner pixels.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        canny_low: Lower hysteresis threshold passed to ``cv2.Canny``.
        canny_high: Upper hysteresis threshold passed to ``cv2.Canny``.
        harris_block_size: Neighbourhood size passed to ``cv2.cornerHarris``.
        harris_ksize: Sobel aperture size passed to ``cv2.cornerHarris``.
        harris_k: Harris detector free parameter.
        corner_ratio: A pixel counts as a corner when its Harris response
            exceeds this fraction of the maximum response in the image.

    Returns:
        Tuple ``(edge_density, corner_density)``: the fraction of pixels
        flagged by Canny and the fraction whose Harris response is above
        the threshold.

    Raises:
        ValueError: If OpenCV cannot read the image.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None; fail with the path
        # instead of letting cvtColor raise an opaque cv2 error.
        raise ValueError(f'Gambar tidak dapat dibaca: {image_path}')

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    edges = cv2.Canny(gray, canny_low, canny_high)
    edge_density = np.sum(edges > 0) / edges.size

    # cornerHarris is given a float32 copy of the grayscale image.
    corners = cv2.cornerHarris(
        np.float32(gray), harris_block_size, harris_ksize, harris_k
    )
    corner_density = np.sum(corners > corner_ratio * corners.max()) / corners.size
    return edge_density, corner_density

# Build the helper frame on df's own index and with explicit column names:
# the assignment aligns on the index, so a default RangeIndex would yield NaN
# whenever df's index is not 0..n-1, and naming the columns keeps the
# assignment valid even when df has no rows.
df[['edge_density', 'corner_density']] = pd.DataFrame(
    df['filepath'].progress_apply(extract_edge_corner_features).tolist(),
    index=df.index,
    columns=['edge_density', 'corner_density'],
)
print('Ekstraksi fitur Edge & Corner selesai.')

### Visualisasi Contoh Edge dan Corner Detection

In [None]:
def visualize_edges_corners(path):
    """Show an image next to its Canny edge map and its Harris corners.

    Three panels are drawn side by side: the unmodified image, the binary
    Canny edge map, and the image with strong Harris responses marked in red.

    Args:
        path: Path of the image file, read with ``cv2.imread``.

    Raises:
        ValueError: If OpenCV cannot read the image.
    """
    img = cv2.imread(path)
    if img is None:
        raise ValueError(f'Gambar tidak dapat dibaca: {path}')

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 100, 200)
    corners = cv2.cornerHarris(np.float32(gray), 2, 3, 0.04)

    # Convert to RGB once and paint the markers in that colour space, so
    # [255, 0, 0] really is red on screen (in a BGR image it would be blue).
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_corners = img_rgb.copy()
    img_corners[corners > 0.01 * corners.max()] = [255, 0, 0]

    panels = [
        (img_rgb, 'Asli', None),
        (edges, 'Edges (Canny)', 'gray'),
        (img_corners, 'Corners (Harris)', None),
    ]
    plt.figure(figsize=(12, 5))
    for position, (panel, title, cmap) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.imshow(panel, cmap=cmap)
        plt.title(title)
        plt.axis('off')
    plt.show()

# Show the edge/corner panels for the example real card, then the fake one,
# printing a header before each figure.
print('--- Kartu REAL ---')
visualize_edges_corners(real_path)
print('\n--- Kartu FAKE ---')
visualize_edges_corners(fake_path)

## 6. Gabungkan Semua Fitur dan Encode Label

In [None]:
# Stack the per-image histogram vectors into one 2-D matrix (one row per image).
X_hist = np.array(list(df['hist_features']))
# The two scalar densities become two extra columns.
extra_features = df[['edge_density', 'corner_density']].to_numpy()
X_combined = np.hstack([X_hist, extra_features])

# LabelEncoder numbers the classes in sorted order, so for the labels
# 'fake'/'real' the result is fake=0, real=1 -- the reverse of the 0=real /
# 1=fake coding used when the CSV labels were mapped to names.
le = LabelEncoder().fit(df['label'])
y = le.transform(df['label'])
print(f'✅ Fitur gabungan siap: {X_combined.shape}')

## 7. Train-Test Split dan Simpan Data Siap Modeling

In [None]:
# Hold out 20% of the samples for testing, keeping the class ratio identical
# in both parts (stratify) and the split reproducible (random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

# Store the encoder's class names next to the integer labels so the archive is
# self-describing: label_classes[i] is the name encoded as integer i. They are
# cast to a plain string array so the file can be loaded without pickle.
np.savez(
    'pokemon_features_ready.npz',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    label_classes=le.classes_.astype(str),
)
print('💾 Data fitur siap dan disimpan sebagai pokemon_features_ready.npz')