In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_recall_curve

In [None]:
# Load the raw transaction log and prepare it for modelling in one pass:
#   1. read the CSV,
#   2. drop the nameOrig / nameDest columns (presumably account identifiers,
#      which are not used as features — TODO confirm),
#   3. one-hot encode the `type` column, dropping the first level so the
#      dummy columns are not perfectly collinear.
df = (
    pd.read_csv("PS_20174392719_1491204439457_log.csv")
    .drop(columns=['nameOrig', 'nameDest'])
    .pipe(pd.get_dummies, columns=['type'], drop_first=True)
)

In [None]:
# Balance-residual features: how far each side of the transaction is from
# "old balance -/+ amount == new balance". A value of 0 means the recorded
# balances are exactly consistent with the transferred amount.
df['erro_saldo_origem'] = (
    df['oldbalanceOrg'].sub(df['amount']).sub(df['newbalanceOrig'])
)
df['erro_saldo_destino'] = (
    df['oldbalanceDest'].add(df['amount']).sub(df['newbalanceDest'])
)

In [None]:
# Separate the features from the `isFraud` target column.
X = df.drop(columns=['isFraud'])
y = df['isFraud']

# 80/20 hold-out split. Stratifying on y keeps the class proportions the same
# in both partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=557660
)

In [None]:
# Standardise the features. The scaler is fitted on the training partition
# only, and the same fitted statistics are then applied to both partitions,
# so nothing from the test set influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:

# Hyper-parameter search for KNN.
# The search runs on a stratified 50,000-row subsample of the scaled training
# data instead of the full training set; the remainder of the split is
# discarded.
X_sample, _, y_sample, _ = train_test_split(
    X_train_scaled,
    y_train,
    train_size=50000,
    stratify=y_train,
    random_state=557660,
)

# 4 x 2 x 2 = 16 candidate configurations.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],  # 1: Manhattan, 2: Euclidean
)

# Each candidate is scored by F1 with 3-fold cross-validation.
knn = KNeighborsClassifier(n_jobs=-1)
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid.fit(X_sample, y_sample)

# Keep the winning configuration so the final model can be rebuilt from it.
best_knn_params = grid.best_params_
print("Melhores Parâmetros Encontrados:", best_knn_params)

In [None]:
# Rebuild the classifier with the best configuration found by the grid search
# and fit it on the full scaled training set (not just the subsample used for
# the search).
modelo_final = KNeighborsClassifier(n_jobs=-1).set_params(**best_knn_params)
modelo_final.fit(X_train_scaled, y_train)

In [None]:
# Pick the decision threshold that maximises F1 on the precision-recall curve.
#
# NOTE(review): the threshold is tuned on the same test split that is used for
# the final report, so the reported metrics are optimistically biased. Tuning
# on a separate validation split would give an unbiased estimate.

# Column 1 of predict_proba is the probability of the second class
# (presumably isFraud == 1 — confirm against modelo_final.classes_).
y_proba = modelo_final.predict_proba(X_test_scaled)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# precision_recall_curve returns one more (precision, recall) pair than
# thresholds; the final pair has no associated threshold, so it is dropped.
precisions_t = precisions[:-1]
recalls_t = recalls[:-1]

# F1 is the harmonic mean 2*P*R / (P + R). The denominator is the SUM of
# precision and recall; dividing by their product (as this cell did before)
# yields ~2 for every non-degenerate point and makes argmax meaningless.
# Points where P + R == 0 are assigned F1 = 0 instead of dividing by zero.
pr_sum = precisions_t + recalls_t
f1_scores = np.divide(
    2 * precisions_t * recalls_t,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
thresholds_aligned = thresholds[:len(f1_scores)]

best_f1_idx = np.argmax(f1_scores)
best_threshold = thresholds_aligned[best_f1_idx]

print(f"Melhor Limiar de Decisão Encontrado: {best_threshold:.4f}")

In [None]:
# Turn probabilities into hard 0/1 predictions using the tuned threshold.
y_pred_final = np.greater_equal(y_proba, best_threshold).astype(int)

print("\n--- RESULTADOS FINAIS DO MODELO KNN OTIMIZADO ---")
print(classification_report(y_test, y_pred_final, digits=4))

# Confusion matrix: rows are the true class, columns the predicted class.
mat_conf = confusion_matrix(y_test, y_pred_final)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    mat_conf,
    annot=True,
    fmt='d',
    cmap='viridis',
    xticklabels=['Normal', 'Fraude'],
    yticklabels=['Normal', 'Fraude'],
    ax=ax,
)
ax.set_title('Matriz de Confusão Final (Limiar Otimizado)', fontsize=16)
ax.set_ylabel('Classe Verdadeira', fontsize=12)
ax.set_xlabel('Classe Prevista', fontsize=12)
plt.show()

In [None]:
# Persist the fitted classifier and the scaler it was trained with. Both are
# saved because the model was fitted on scaled features, so inference needs
# the same fitted scaler.
for artefato, caminho in (
    (modelo_final, "modelo_knn_fraude.pkl"),
    (scaler, "scaler_fraude.pkl"),
):
    joblib.dump(artefato, caminho)

print("\nModelo e Scaler salvos com sucesso!")