In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_score, recall_score, f1_score,precision_recall_curve,auc
from imblearn.over_sampling import SMOTE


In [2]:
df = pd.read_csv('./data/creditcard.csv')

In [3]:
# Scale the 'Amount' feature
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])

# Drop the 'Time' feature
df = df.drop(['Time'], axis=1)

In [4]:
# 'Class' is target variable
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
print(y_train.value_counts(normalize=True))

Class
0    0.998271
1    0.001729
Name: proportion, dtype: float64


In [6]:
# Compute class weights for balanced models
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Initialize Decision Tree Classifier with balanced class weights
dt_clf = DecisionTreeClassifier(random_state=42, class_weight="balanced")

# Train the Decision Tree model
dt_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = dt_clf.predict(X_test)

# Print performance metrics
print(f"Decision Tree - Precision: {precision_score(y_test, y_pred):.2f}, Recall: {recall_score(y_test, y_pred):.2f}, F1-Score: {f1_score(y_test, y_pred):.2f}")

Decision Tree - Precision: 0.77, Recall: 0.73, F1-Score: 0.75


Decision Tree - Precision: 0.77, Recall: 0.73, F1-Score: 0.75

In [10]:
# Decision Tree Classifier Optimization

param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='recall', n_jobs=-1)
dt_grid_search.fit(X_train, y_train)
best_dt = dt_grid_search.best_estimator_

# Make predictions with the optimized Decision Tree model
y_pred_optimized = best_dt.predict(X_test)

# Calculate and print the performance metrics for the optimized Decision Tree model
print(f"Optimized Decision Tree - Precision: {precision_score(y_test, y_pred_optimized):.2f}, Recall: {recall_score(y_test, y_pred_optimized):.2f}, F1-Score: {f1_score(y_test, y_pred_optimized):.2f}")


Optimized Decision Tree - Precision: 0.87, Recall: 0.76, F1-Score: 0.81


Optimized Decision Tree - Precision: 0.87, Recall: 0.76, F1-Score: 0.81