## Imports
---

In [None]:
#external
import pandas as pd
import numpy as np
import joblib

#visualization
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

#sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score 
from sklearn.metrics import recall_score, confusion_matrix, roc_auc_score, roc_curve, auc

#models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#oversampling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

#utils
from src.utils.dataset import get_full_transactions_dataset

## Dataset
---

In [None]:
# Work on a fixed-size random subset of the transactions.
# The draw is seeded so the class balance, the train/test split and every score
# below are the same after a kernel restart.
df = get_full_transactions_dataset().sample(50000, random_state=42)

In [None]:
# Column dtypes of the sampled frame; the preprocessing below routes columns by
# object vs. non-object dtype.
df.dtypes

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Relative frequency of each target class in the sample.
df["is_laundering"].value_counts(normalize=True)

## Transformations
---

In [None]:
def custom_transformations(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Prepare a raw transactions frame for modelling.

    Removes the ``sender`` and ``receiver`` columns and replaces ``timestamp``
    with the integer ``Timestamp.value`` of each parsed entry. The input frame
    is not modified; a new frame is returned.
    """
    features = dataframe.drop(columns=["sender", "receiver"])
    parsed_timestamps = pd.to_datetime(features["timestamp"])
    features["timestamp"] = parsed_timestamps.apply(lambda moment: moment.value)
    return features

In [None]:
# Apply the cleaning step to the working sample.
# NOTE(review): df is rebound to the transformed frame, so running this cell a
# second time raises KeyError (sender/receiver are already gone).
df = custom_transformations(df)

## Defining pipeline
---

In [None]:
# Split the frame into the feature matrix and the binary target.
X, y = df.drop(columns=["is_laundering"]), df["is_laundering"]

In [None]:
# Route columns by dtype: object columns are treated as categorical, everything
# else as numerical.
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns

In [None]:
# Numerical features: scaled with RobustScaler.
numerical_pipeline = Pipeline([
    ("scaler", RobustScaler())
])

# Categorical features: integer-coded.
# Categories that were not seen during fit (possible in the test split, in a CV
# fold, or when scoring a new account) are mapped to -1 instead of raising.
categorical_pipeline = Pipeline([
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Apply each sub-pipeline to its own column group; numerical outputs come first.
preprocess = ColumnTransformer([
    ("numerical_pipeline", numerical_pipeline, numerical_columns),
    ("categorical_pipeline", categorical_pipeline, categorical_columns)
])

## Train
---

### Train & Test split

In [None]:
# Hold out 20% of the rows for testing.
# The split is stratified on the target so the rare positive class keeps the
# same proportion in both partitions; an unstratified draw can leave the test
# set with very few (or no) positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Shapes of the four partitions, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Class proportions in the training and test targets.
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

In [None]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the held-out split.
# NOTE(review): both names are rebound from the raw splits to the transformed
# output, so this cell presumably cannot be re-run without re-running the split
# first — confirm.
X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(X_test)

### Oversampling (SMOTE)

In [None]:
# Oversample the training split with SMOTE (sampling_strategy=0.25); the test
# split is left untouched.
# NOTE(review): the grid searches below cross-validate on this already-resampled
# data, so validation folds contain synthetic points and their scores are likely
# optimistic. The later full_pipeline resamples inside the pipeline instead.
oversample = SMOTE(sampling_strategy=0.25, random_state=42)
X_train_resample, y_train_resample = oversample.fit_resample(
    X_train,
    y_train,
)

In [None]:
# Absolute class counts: resampled training target vs. untouched test target.
y_train_resample.value_counts(), y_test.value_counts() 

### Grid search

In [None]:
# Hyper-parameter grid explored for the XGBoost classifier (3 values per axis).
grid_parameter = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
)

In [None]:
# Exhaustive search over grid_parameter for XGBoost, ranked by F1 with cv=3.
# NOTE(review): device="cuda" presumably needs a CUDA-capable GPU and XGBoost
# build — confirm before running on a CPU-only machine.
xg_boost_grid = GridSearchCV(
    XGBClassifier(random_state=42, device="cuda"),
    grid_parameter,
    scoring="f1",
    cv=3,
    verbose=3,
)

In [None]:
# Run the XGBoost search on the SMOTE-resampled training data.
xg_boost_model = xg_boost_grid.fit(X=X_train_resample, y=y_train_resample)

In [None]:
# Report the winning XGBoost configuration and its mean cross-validated score.
print(
    f"Best parameters: {xg_boost_model.best_params_}",
    f"Best score is: {xg_boost_model.best_score_}",
    sep="\n",
)

In [None]:
# Hyper-parameter grid explored for the random forest.
# NOTE(review): this rebinds grid_parameter, replacing the XGBoost grid defined
# earlier; re-running the XGBoost search after this cell would pick up these keys.
# NOTE(review): the max_features values presumably exceed the number of
# ordinal-encoded columns — verify against X_train.shape[1].
grid_parameter = dict(
    max_depth=[4, 8, 16],
    n_estimators=[100, 200, 300],
    max_features=[20, 40, 80],
)

In [None]:
# Exhaustive search over grid_parameter for the random forest with cv=3.
# Scored with F1, the same metric as the XGBoost search, so the two printed
# "best score" values can be compared when picking the model.
random_forest_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=grid_parameter,
    cv=3,
    scoring="f1",
    verbose=3)

In [None]:
# Run the random-forest search on the SMOTE-resampled training data.
random_forest_model = random_forest_grid.fit(X=X_train_resample, y=y_train_resample)

In [None]:
# Report the winning random-forest configuration and its mean cross-validated score.
print(
    f"Best parameters: {random_forest_model.best_params_}",
    f"Best score is: {random_forest_model.best_score_}",
    sep="\n",
)

## Testing best model
---

In [None]:
# Reload the transactions without sub-sampling; the evaluation set below is
# drawn from this frame.
full_dataframe = get_full_transactions_dataset()

In [None]:
# Build the evaluation set: every laundering transaction plus a fixed-size
# random draw of 100,000 non-laundering ones.
# The draw is seeded so the cross-validation figures below are reproducible.
fraud_transactions_df = full_dataframe[full_dataframe["is_laundering"] == 1]
non_fraud_transactions_df = full_dataframe[full_dataframe["is_laundering"] == 0].sample(
    int(100e3), random_state=42
)
sample_dataframe = pd.concat([fraud_transactions_df, non_fraud_transactions_df])

In [None]:
# Absolute class counts in the evaluation set.
sample_dataframe["is_laundering"].value_counts()

In [None]:
# Total number of rows in the evaluation set.
len(sample_dataframe)

In [None]:
# Apply the same cleaning step used for the training sample.
# NOTE(review): sample_dataframe is rebound, so re-running this cell raises
# KeyError (sender/receiver are already gone).
sample_dataframe = custom_transformations(sample_dataframe)

In [None]:
# Features and target of the evaluation set.
# NOTE(review): X and y are rebound here; the earlier training-sample X / y are
# no longer reachable under these names.
X, y = (
    sample_dataframe.drop(columns=["is_laundering"]),
    sample_dataframe["is_laundering"],
)

In [None]:
# End-to-end pipeline: preprocessing -> SMOTE -> XGBoost.
# Preprocessing and resampling are pipeline steps, so they are fitted together
# with the model on whatever data the pipeline is fitted on.
# The hyper-parameters are hard-coded (presumably the grid-search winner —
# confirm against the printed best_params_). No device argument is passed here,
# unlike the grid-search estimator.
full_pipeline = ImbPipeline(steps=[
    ("preprocess", preprocess),
    ("smote", SMOTE(sampling_strategy=0.25, random_state=42)),
    ("model", XGBClassifier(
        n_estimators=500,
        max_depth=7,
        min_child_weight=1,
        learning_rate=0.1,
        random_state=42,
    )),
])

In [None]:
# Seeded, shuffled 8-fold stratified splitter shared by all evaluations below.
cv = StratifiedKFold(
    n_splits=8,
    random_state=42,
    shuffle=True,
)

In [None]:
# Cross-validate the full pipeline, collecting F1, accuracy and ROC AUC per fold.
# NOTE(review): the result is stored but not displayed in the visible cells.
cross_validation_result = cross_validate(
    estimator=full_pipeline,
    X=X,
    y=y,
    scoring=("f1", "accuracy", "roc_auc"),
    cv=cv,
    verbose=3,
)

## Evaluate
---

In [None]:
# Out-of-fold positive-class probabilities for every row (one fit per fold).
y_pred_prob = cross_val_predict(full_pipeline, X, y, cv=cv, method='predict_proba')[:, 1]
# Hard labels are derived from those probabilities instead of refitting all
# folds a second time. This mirrors XGBClassifier's binary predict(), which
# labels a row positive when its probability is strictly above 0.5.
y_pred = (y_pred_prob > 0.5).astype(int)

In [None]:
# Metrics of the out-of-fold predictions at the default decision threshold.
acc = accuracy_score(y_true=y, y_pred=y_pred)
print(f'Accuracy: {acc:.4f}')
f1 = f1_score(y_true=y, y_pred=y_pred)
print(f'F1 Score: {f1:.4f}')
precision = precision_score(y_true=y, y_pred=y_pred)
print(f'Precision: {precision:.4f}')
recall = recall_score(y_true=y, y_pred=y_pred)
print(f'Recall: {recall:.4f}')
# AUC is threshold-free, so it is computed from the probabilities.
auc_score = roc_auc_score(y_true=y, y_score=y_pred_prob)
print(f'AUC: {auc_score:.4f}')

In [None]:
# Confusion matrix of the default-threshold predictions, drawn as an annotated
# heatmap (rows: real class, columns: predicted class).
cm = confusion_matrix(y_true=y, y_pred=y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(
    data=cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=['Negative', 'Positive'],
    yticklabels=['Negative', 'Positive'],
)
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Real')
plt.show()

In [None]:
# ROC curve of the out-of-fold probabilities and the area under it.
fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred_prob)
roc_auc = auc(x=fpr, y=tpr)

plt.figure(figsize=(8, 6))
plt.plot(
    fpr,
    tpr,
    color='darkblue',
    linewidth=2,
    label=f"ROC curve (AUC = {roc_auc:.2f})",
)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='gray', linewidth=2, linestyle='--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Pick the threshold whose ROC point lies closest (Euclidean distance) to the
# ideal corner FPR=0, TPR=1.
# NOTE(review): the threshold is chosen on the same out-of-fold predictions it
# is evaluated on in the next cells, so those adjusted metrics are optimistic.
distances = np.sqrt(np.square(fpr) + np.square(1 - tpr))
best_threshold_idx = distances.argmin()
best_threshold = thresholds[best_threshold_idx]
print(f'O melhor limiar (threshold) é {best_threshold:.4f}')

In [None]:
# Re-label every row using the selected threshold (inclusive comparison).
y_pred_adjusted = np.where(y_pred_prob >= best_threshold, 1, 0)

In [None]:
# Same metrics as before, recomputed for the threshold-adjusted labels.
acc = accuracy_score(y_true=y, y_pred=y_pred_adjusted)
print(f'Accuracy: {acc:.4f}')
f1 = f1_score(y_true=y, y_pred=y_pred_adjusted)
print(f'F1 Score: {f1:.4f}')
precision = precision_score(y_true=y, y_pred=y_pred_adjusted)
print(f'Precision: {precision:.4f}')
recall = recall_score(y_true=y, y_pred=y_pred_adjusted)
print(f'Recall: {recall:.4f}')

In [None]:
# Confusion matrix of the threshold-adjusted predictions, drawn as an annotated
# heatmap (rows: real class, columns: predicted class).
cm = confusion_matrix(y_true=y, y_pred=y_pred_adjusted)
plt.figure(figsize=(8, 6))
sns.heatmap(
    data=cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=['Negative', 'Positive'],
    yticklabels=['Negative', 'Positive'],
)
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Real')
plt.show()

## Saving model
---

In [None]:
# Final fit on the complete evaluation set before the pipeline is persisted.
full_pipeline.fit(X=X, y=y)

In [None]:
# Persist the fitted pipeline (preprocessing + SMOTE step + model) to disk.
joblib.dump(value=full_pipeline, filename="xgb_pipeline.pkl")

## Making prediction
---

In [22]:
# Load the transactions of a single account to score with the saved pipeline.
# NOTE(review): the account id is hard-coded, and df is rebound — the training
# sample previously held under this name is no longer reachable.
df = get_full_transactions_dataset(account_id="1024_800ECB1A0")



In [23]:
def custom_transformations(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Prepare a transactions frame for scoring with the saved pipeline.

    Removes ``sender`` and ``receiver``, removes the ``is_laundering`` label
    when it is present, and replaces ``timestamp`` with the integer
    ``Timestamp.value`` of each parsed entry. The input frame is not modified.

    NOTE(review): this redefinition shadows the training-time function of the
    same name defined earlier in the notebook; re-running earlier cells after
    this one would strip the label they rely on.
    """
    dataframe = dataframe.drop(columns=["sender", "receiver"])
    # Unlabelled data has no target column, so dropping it must not raise.
    dataframe = dataframe.drop(columns=["is_laundering"], errors="ignore")
    dataframe["timestamp"] = pd.to_datetime(dataframe["timestamp"])
    dataframe["timestamp"] = dataframe["timestamp"].apply(lambda timestamp_value: timestamp_value.value)
    return dataframe

In [24]:
# Feature frame for the selected account (label and party columns removed).
X = custom_transformations(df)

In [25]:
# Restore the pipeline persisted in the "Saving model" section.
# joblib.load unpickles arbitrary objects: only load artifacts from a trusted source.
loaded_pipeline = joblib.load(filename="xgb_pipeline.pkl")

In [26]:
# Score the account: positive-class probability first, then the hard labels.
# NOTE(review): predict() uses the model's default decision rule, not the
# best_threshold selected in the Evaluate section.
y_proba = loaded_pipeline.predict_proba(X)[:, 1]
y_pred = loaded_pipeline.predict(X)

In [27]:
# Highest laundering probability among this account's transactions.
y_proba.max()

0.999966

In [29]:
# Show probabilities and hard labels together (the array reprs are truncated).
y_proba, y_pred

(array([7.9203364e-06, 6.9044542e-04, 2.6671486e-03, ..., 7.1519884e-05,
        3.3266802e-04, 1.4664488e-02], dtype=float32),
 array([0, 0, 0, ..., 0, 0, 0]))

In [30]:
# Hard labels only (already shown as the second item of the previous cell).
y_pred

array([0, 0, 0, ..., 0, 0, 0])