In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV,train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### 讀取資料

In [None]:
# Load the raw transaction data.
# The path is relative, so it is resolved against the kernel's working directory.
df_row = pd.read_csv("archive/credit_card_transactions-ibm_v2.csv")

#### 資料前處理

In [None]:
# Work on a copy so the raw frame loaded above stays untouched and this cell
# can be re-run safely.
df = df_row.copy()

# --- Type conversion and text cleanup ---
# Composite key identifying one physical card (user id + card index).
df["card_id"] = df["User"].astype(str) + "_" + df["Card"].astype(str)
# "$" is a regex anchor; regex=False makes the literal replacement explicit
# instead of relying on the pandas-version-dependent default.
df["Amount"] = df["Amount"].str.replace("$", "", regex=False).astype(float)
# "Time" is sliced positionally: characters 0-1 -> hour, characters 3-4 -> minute.
df["Hour"] = df["Time"].str[0:2].astype(float)
df["Minute"] = df["Time"].str[3:5].astype(float)

# --- Missing-value imputation ---
df["Errors?"] = df["Errors?"].fillna("No error")
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and does not modify `df`
# once pandas Copy-on-Write is active.
df["Merchant State"] = df["Merchant State"].fillna(0)

# --- Target encoding: 'Yes' -> 1, anything else -> 0 ---
df["Is Fraud?"] = (df["Is Fraud?"] == "Yes").astype(int)

# --- Frequency encoding of categorical columns ---
# Each category is replaced by its share of all rows (count / len(df)).
# NOTE(review): the shares are computed on the full frame before the
# train/test split, so test rows contribute to the encoding.
for _col in ["Merchant City", "Use Chip", "Errors?", "card_id", "Merchant State", "MCC"]:
    df[f"{_col} freq"] = df[_col].map(df[_col].value_counts() / len(df))

In [None]:
# Discard the raw categorical / identifier columns; only the numeric features
# and the derived "... freq" encodings remain afterwards.
df = df.drop(
    columns=[
        "Merchant City", "Use Chip", "Errors?", "card_id",
        "Time", "User", "Card", "Year", "Merchant State", "Zip", "MCC",
    ]
)

In [None]:
# Down-sample the non-fraud class and rebuild the working dataset:
# 1,000,000 randomly drawn non-fraud rows (fixed seed) plus every fraud row.
x_nom = df.loc[df["Is Fraud?"].eq(0)].sample(n=1_000_000, random_state=42)
x_nov = df.loc[df["Is Fraud?"].eq(1)]
df = pd.concat([x_nom, x_nov])

In [None]:
# Only the last expression of a notebook cell is rendered automatically, so the
# preview has to be displayed explicitly; otherwise its output is discarded.
display(df.head())
# Class balance after sampling.
df['Is Fraud?'].value_counts()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

#### 模型建立

In [None]:
# Build the XGBoost model
def XGB(X_train, y_train, X_test, y_test):
    """Grid-search an XGBoost classifier and return the best estimator found.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    X_test, y_test
        Not used here; accepted so every model builder shares one signature.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted ``GridSearchCV``.
    """
    # Hyper-parameter grid
    param_grid = {
        'max_depth': [2, 3, 4, 5],                    # maximum tree depth
        # Fixed typo: this was written `0,1`, which added the two separate
        # candidates 0 and 1 instead of the intended 0.1.
        'learning_rate': [0.01, 0.04, 0.05, 0.1],     # learning rate
        'n_estimators': [100, 200, 300],              # number of trees
        'subsample': [0.8, 0.9, 1.0],                 # row sampling ratio
        'colsample_bytree': [0.8, 0.9, 1.0],          # feature sampling ratio per tree
    }

    # Base classifier
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    # Exhaustive search with 5-fold cross-validation.
    # NOTE(review): candidates are ranked by accuracy, which can be misleading
    # when the positive class is rare; confirm this is the intended metric.
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Build the LightGBM model
def Lightgbm(X_train, y_train, X_test, y_test):
    """Tune a LightGBM classifier by grid search and return the winning model.

    ``X_train`` / ``y_train`` are used for the 3-fold cross-validated search.
    ``X_test`` / ``y_test`` are accepted but not used by this function.
    The return value is the ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search_space = dict(
        max_depth=[2, 3, 4, 5],                     # maximum tree depth
        learning_rate=[0.01, 0.04, 0.05, 0.1],      # learning rate
        n_estimators=[100, 200, 300],               # number of trees
        subsample=[0.8, 0.9, 1.0],                  # row sampling ratio
        colsample_bytree=[0.8, 0.9, 1.0],           # feature sampling ratio per tree
        num_leaves=[7, 15, 31],                     # number of leaves per tree
        force_col_wise=[True],                      # single fixed value: always column-wise
    )
    base_classifier = LGBMClassifier(verbose=-1)
    tuner = GridSearchCV(
        estimator=base_classifier,
        param_grid=search_space,
        cv=3,
        scoring='accuracy',
        verbose=0,
    )
    tuner.fit(X_train, y_train)
    return tuner.best_estimator_

# Build the CatBoost model
def Catboost(X_train, y_train, X_test, y_test):
    """Grid-search a CatBoost classifier and return the best estimator found.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    X_test, y_test
        Not used here; accepted so every model builder shares one signature.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted ``GridSearchCV``.
    """
    param_grid = {
        'max_depth': [2, 3, 4, 5],                    # maximum tree depth
        # Fixed typo: this was written `0,1`, which added the two separate
        # candidates 0 and 1 instead of the intended 0.1.
        'learning_rate': [0.01, 0.04, 0.05, 0.1],     # learning rate
        'iterations': [100, 200, 300],                # number of boosting iterations
        'subsample': [0.8, 0.9, 1.0],                 # row sampling ratio
    }
    cat_model = CatBoostClassifier(loss_function='Logloss', verbose=False)
    # Exhaustive search with 5-fold cross-validation, ranked by accuracy.
    grid_search = GridSearchCV(estimator=cat_model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


#### 切分訓練及測試資料集 (80/20)

In [None]:
# Split into training and test sets
def split_data(df_filtered_columns, target='Is Fraud?', test_size=0.2,
               random_state=42, stratify=False):
    """Separate features from the label and split them into train/test sets.

    Parameters
    ----------
    df_filtered_columns : pandas.DataFrame
        Frame holding the feature columns plus the ``target`` column.
    target : str, default 'Is Fraud?'
        Name of the label column.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` (fraction of rows held out).
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.
    stratify : bool, default False
        When True the label is passed as ``stratify=`` so both partitions keep
        the same class ratio. The default keeps the original unstratified split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by ``train_test_split``.
    """
    X = df_filtered_columns.drop(columns=[target])
    y = df_filtered_columns[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    return X_train, X_test, y_train, y_test


# Create the train/test partitions from the prepared frame `df`.
X_train, X_test, y_train, y_test = split_data(df)

#### 模型訓練

In [None]:
# Train the models.
# Each call runs a full cross-validated grid search and keeps the best estimator it finds.
XGB_model = XGB(X_train, y_train, X_test, y_test)
Lightgbm_model = Lightgbm(X_train, y_train, X_test, y_test)
Catboost_model = Catboost(X_train, y_train, X_test, y_test)

#### 模型評估

In [None]:
# 模型評估

# Draw a confusion matrix
def plot_confusion_matrix(y_true, y_pred, title='Confusion Matrix'):
    """Render the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap.

    The matrix is computed with ``confusion_matrix`` and drawn on a new
    8x5 figure with integer cell annotations; ``title`` is used as the plot
    title. The figure is shown immediately and nothing is returned.
    """
    matrix = confusion_matrix(y_true, y_pred)
    _, axis = plt.subplots(figsize=(8, 5))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        linewidths=.5,
        ax=axis,
        cmap='Blues',
    )
    plt.title(title)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()
    
# Print the model's scores and draw its confusion matrix
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name="Model"):
    """Report test-set performance of a fitted model.

    Predicts on ``X_test``, prints the accuracy and the classification report
    against ``y_test``, then plots the confusion matrix. ``X_train`` and
    ``y_train`` are accepted but not used. The same ``model`` object that was
    passed in is returned.
    """
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)

    for line in (
        f"{model_name} Accuracy: {score}",
        f"{model_name} Classification Report:",
        classification_report(y_test, predictions),
    ):
        print(line)

    plot_confusion_matrix(y_test, predictions, title=f'{model_name} Confusion Matrix')

    return model

# Evaluate each tuned model on the held-out test set.
# evaluate_model returns the model it was given, so each *_eval name refers to
# the same fitted estimator as the corresponding *_model variable.
XGB_model_eval = evaluate_model(XGB_model, X_train, y_train, X_test, y_test, model_name="XGBoost")
LightGBM_model_eval = evaluate_model(Lightgbm_model, X_train, y_train, X_test, y_test, model_name="LightGBM")
CatBoost_model_eval = evaluate_model(Catboost_model, X_train, y_train, X_test, y_test, model_name="CatBoost")
