In [None]:
# 信用卡詐欺偵測 - ex1.ipynb

# 安裝必要套件
!pip install kagglehub --quiet

# 匯入套件
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import kagglehub

# General settings
TEST_SIZE = 0.3    # passed as test_size to the train/test split
RANDOM_SEED = 42   # reused wherever a random_state is required

# Load the dataset: download it through kagglehub, then read the CSV it contains
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
csv_file = f"{path}/creditcard.csv"
data = pd.read_csv(csv_file)
data = data.astype({'Class': int})

# Preprocessing: drop the Time column and standardize Amount
data = data.drop(columns='Time')
# NOTE(review): the scaler is fitted on every row here, i.e. before the
# train/test split performed later in the file, so test-row statistics
# contribute to the scaling.
amount_2d = data['Amount'].to_numpy().reshape(-1, 1)  # one column, 2-D for the scaler
data['Amount'] = StandardScaler().fit_transform(amount_2d)

# Print the fraud ratio (Class == 1 is treated as fraud, Class == 0 as non-fraud)
class_labels = data['Class']
fraud = data[class_labels == 1]
nonfraud = data[class_labels == 0]
print(f"Fraudulent: {len(fraud)}, Non-fraudulent: {len(nonfraud)}")
print(f"Positive class (fraud) percentage: {len(fraud) / len(data) * 100:.3f}%")

# ===========================
# Supervised learning - Random Forest
# ===========================
# Feature matrix (every column except Class) and label vector as NumPy arrays.
X = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])

# Stratify on y so both partitions keep the same class proportions. With a
# plain random split the share of fraud rows (presumably a small minority in
# this dataset) can differ noticeably between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

# Fit a 100-tree forest on the training partition and predict the held-out rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

print("\nRandom Forest Result:")
print(classification_report(y_test, y_pred))

# ===========================
# Unsupervised learning - KMeans
# ===========================
# Cluster only the first 1000 training rows whose label is 0 (non-fraud).
x_train_unsup = X_train[y_train == 0][:1000]

# Candidate cluster counts. optimal_k is looked up in this range below, so the
# range can be edited without touching any index arithmetic.
candidate_ks = range(2, 5)

scores = []         # silhouette score of each candidate, in candidate order
fitted_models = []  # the fitted estimator of each candidate, same order
for k in candidate_ks:
    model = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
    model.fit(x_train_unsup)
    fitted_models.append(model)
    scores.append(silhouette_score(x_train_unsup, model.labels_))

# np.argmax returns the first maximum, so ties resolve to the smallest k.
best_index = int(np.argmax(scores))
optimal_k = candidate_ks[best_index]

# Reuse the already-fitted winner: refitting with the same data, parameters
# and random_state would only repeat the same computation.
kmeans = fitted_models[best_index]
y_pred_test = kmeans.predict(X_test)

def align_labels(y_true, y_pred, n_clusters):
    """Relabel cluster ids with the most frequent true label of each cluster.

    Args:
        y_true: Array-like of non-negative integer ground-truth labels.
        y_pred: Array-like of cluster ids for the same samples, in the same
            order and of the same length as ``y_true``.
        n_clusters: Number of cluster ids to remap; ids ``0`` through
            ``n_clusters - 1`` are processed.

    Returns:
        A NumPy array with the shape and dtype of ``y_pred`` in which every
        sample carries the majority true label of its cluster. When several
        labels tie, the smallest one wins. Samples whose cluster id falls
        outside ``0 .. n_clusters - 1`` keep the label 0.
    """
    # Coerce to arrays: with plain lists, `y_pred == i` would be a single
    # False instead of an element-wise mask and every sample would silently
    # end up labelled 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    labels = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        mask = y_pred == cluster_id
        # An empty cluster has nothing to relabel (and bincount of an empty
        # selection has no argmax), so it is skipped.
        if mask.any():
            labels[mask] = np.bincount(y_true[mask]).argmax()
    return labels

# Convert the test-set cluster ids into class labels via majority vote,
# then print the per-class metrics for the KMeans predictions.
y_pred_aligned = align_labels(
    y_true=y_test,
    y_pred=y_pred_test,
    n_clusters=optimal_k,
)

print("\nKMeans Result:")
kmeans_report = classification_report(y_test, y_pred_aligned)
print(kmeans_report)
