In [30]:
# Importação de bibliotecas para manipulação e visualização de dados
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults for the whole notebook: seaborn dark grid, wide figures, larger font
sns.set_style('darkgrid')
plt.rcParams.update({
    'figure.figsize': (15, 8),
    'font.size': 12,
})

print("Ambiente configurado e bibliotecas importadas com sucesso!")

# Load the raw bank-transactions CSV into `df`.
# The path is relative to the notebook's working directory; edit it here if the
# file lives somewhere else.
try:
    df = pd.read_csv('../data/bank_transactions_data.csv')
except FileNotFoundError:
    print("ERRO: Arquivo não encontrado. Verifique o caminho e nome do arquivo.")
    # Re-raise after the friendly message: otherwise the run would continue with
    # `df` undefined and any later use of it would fail with an unrelated NameError.
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist here.
    print("Dataset carregado com sucesso!")
    print(f"O dataset contém {df.shape[0]} linhas e {df.shape[1]} colunas.")

Ambiente configurado e bibliotecas importadas com sucesso!
Dataset carregado com sucesso!
O dataset contém 2512 linhas e 16 colunas.


# Carregar os Dados

In [31]:
import numpy as np
import pickle

# Directory holding the artifacts written by the Phase 2 preprocessing step
processed_dir = '../data/processed'

# Load the processed feature matrix from Phase 2
X_processed = np.load(f'{processed_dir}/X_processed.npy')

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load these .pkl files when they come from a trusted run of the pipeline.

# Load the scaler (useful for transforming new data)
with open(f'{processed_dir}/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# Load the feature names
with open(f'{processed_dir}/feature_names.pkl', 'rb') as f:
    feature_names = pickle.load(f)

# Load the preprocessing summary
with open(f'{processed_dir}/preprocessing_summary.pkl', 'rb') as f:
    summary = pickle.load(f)

# Fail fast when the artifacts disagree: the feature names are later used as the
# column labels of X_processed, so there must be exactly one name per column.
if X_processed.ndim != 2 or X_processed.shape[1] != len(feature_names):
    raise ValueError(
        f"Artefatos inconsistentes: X_processed tem shape {X_processed.shape}, "
        f"mas feature_names tem {len(feature_names)} nomes."
    )

print(f"✅ Dados carregados de: {processed_dir}/")
print(f"   - Shape dos dados: {X_processed.shape}")
print(f"   - Features disponíveis: {len(feature_names)}")
print(f"   - Data do preprocessing: {summary.get('preprocessing_date', 'N/A')}")

✅ Dados carregados de: ../data/processed/
   - Shape dos dados: (2512, 15)
   - Features disponíveis: 15
   - Data do preprocessing: 2025-06-25 21:58:15


# Treinando o Modelo Isolation Forest

In [32]:
from sklearn.ensemble import IsolationForest
import joblib
import os
import numpy as np
import pandas as pd
import pickle

# Configure the detector.
# `contamination` is our estimate of the share of anomalies in the dataset. It is the
# most important setting; a low value such as 0.01 (1%) or 0.02 (2%) is a good start.
# `random_state` makes the results identical on every run.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.02,
    random_state=42,
)

# Fit the model and label every row in a single call. fit_predict() returns:
#    1  for normal transactions (inliers)
#   -1  for anomalies (outliers)
predictions = iso_forest.fit_predict(X_processed)

# Short report on what was trained and on which data
print(
    "Modelo treinado e predições realizadas!",
    f"Total de predições: {len(predictions)}",
    f"Formato dos dados processados: {X_processed.shape}",
    f"Features utilizadas: {len(feature_names)}",
    sep="\n",
)

# Output locations: serialized models, result files, and plots nested under results
models_dir = '../models'
results_dir = '../results'
plots_dir = results_dir + '/plots'

# Make sure every output folder exists; folders that are already there are left alone
for directory in (models_dir, results_dir, plots_dir):
    os.makedirs(directory, exist_ok=True)

# Persist the fitted model under models/
model_path = models_dir + '/isolation_forest_model.pkl'
joblib.dump(iso_forest, model_path)
print(f"✅ Modelo salvo em: {model_path}")

# Persist the -1/1 predictions under results/
predictions_path = results_dir + '/predictions.npy'
np.save(predictions_path, predictions)
print(f"✅ Predições salvas em: {predictions_path}")

# Label counts, computed once and reused in the summary below
n_predictions = len(predictions)
n_anomalies = int(np.sum(predictions == -1))
n_normals = int(np.sum(predictions == 1))

# Summary of the fitted model: hyperparameters read back from the estimator,
# a description of the training data, the label counts and a timestamp
model_summary = {
    'model_type': 'IsolationForest',
    'parameters': {
        param: getattr(iso_forest, param)
        for param in (
            'n_estimators',
            'contamination',
            'random_state',
            'max_samples',
            'max_features',
        )
    },
    'training_data_shape': X_processed.shape,
    'features_used': feature_names,
    'predictions_summary': {
        'total_predictions': n_predictions,
        'anomalies_detected': n_anomalies,
        'normal_transactions': n_normals,
        # Observed fraction of rows labelled -1
        'contamination_actual': float(n_anomalies / n_predictions)
    },
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

# Pickle the summary next to the model file
summary_path = f'{models_dir}/model_summary.pkl'
with open(summary_path, 'wb') as f:
    pickle.dump(model_summary, f)
print(f"✅ Resumo do modelo salvo em: {summary_path}")

# Closing recap of where the outputs were written
print("\n🎉 Modelo pronto para uso!")
print(f"📁 Modelo salvo em: {models_dir}/")
print(f"📊 Predições salvas em: {results_dir}/")

Modelo treinado e predições realizadas!
Total de predições: 2512
Formato dos dados processados: (2512, 15)
Features utilizadas: 15
✅ Modelo salvo em: ../models/isolation_forest_model.pkl
✅ Predições salvas em: ../results/predictions.npy
✅ Resumo do modelo salvo em: ../models/model_summary.pkl

🎉 Modelo pronto para uso!
📁 Modelo salvo em: ../models/
📊 Predições salvas em: ../results/


# Análise das Anomalias Detectadas

In [33]:
# Tally the anomaly (-1) and normal (1) labels
is_anomaly = predictions == -1
is_normal = predictions == 1
print("\n--- Contagem de Anomalias vs. Normais ---")
print(f"Anomalias (-1): {np.sum(is_anomaly)}")
print(f"Normais (1): {np.sum(is_normal)}")

# Rebuild a labelled DataFrame from the processed matrix, using the feature names
# as column labels
df_processed = pd.DataFrame(X_processed, columns=feature_names)

# Attach the predictions; despite its name, this column holds the -1/1 labels
df_processed['anomaly_score'] = predictions

# Split the rows into the two groups by label
labels = df_processed['anomaly_score']
anomalies_processed = df_processed[labels == -1]
normals_processed = df_processed[labels == 1]

print(f"\nTotal de {len(anomalies_processed)} anomalias detectadas.")
print("\n--- Amostra das Transações Anômalas (Dados Processados) ---")
display(anomalies_processed.head())

# Compare the statistical profile of the anomalies against the normal transactions,
# using every available feature (the label column is excluded)
features_for_analysis = [col for col in feature_names if col != 'anomaly_score']

# Same report for both groups: heading, a reminder that the values are scaled,
# then the describe() table
for heading, group in (
    ("\n--- PERFIL ESTATÍSTICO DAS ANOMALIAS ---", anomalies_processed),
    ("\n--- PERFIL ESTATÍSTICO DAS TRANSAÇÕES NORMAIS ---", normals_processed),
):
    print(heading)
    print("(Dados após preprocessing - escalados)")
    display(group[features_for_analysis].describe())

# For a more interpretable view, rank the features by how far the anomaly mean
# is from the normal mean (values are in the preprocessed, scaled units).
print("\n--- ANÁLISE DAS DIFERENÇAS ENTRE ANOMALIAS E NORMAIS ---")
anomaly_means = anomalies_processed[features_for_analysis].mean()
normal_means = normals_processed[features_for_analysis].mean()
differences = anomaly_means - normal_means

# Sort by absolute difference so the most discriminative features come first
differences_sorted = differences.abs().sort_values(ascending=False)

print("Features mais discriminativas (ordenadas por diferença absoluta nas médias):")
for feature in differences_sorted.index[:10]:  # Top 10 most different features
    diff = differences[feature]
    # Label the direction explicitly: a zero difference must not be reported as
    # "normals larger", and a NaN (e.g. one group is empty) has no direction.
    if diff > 0:
        direction = '(anomalias maior)'
    elif diff < 0:
        direction = '(normais maior)'
    elif diff == 0:
        direction = '(médias iguais)'
    else:
        direction = '(indefinido)'
    print(f"{feature}: {diff:.3f} {direction}")



--- Contagem de Anomalias vs. Normais ---
Anomalias (-1): 51
Normais (1): 2461

Total de 51 anomalias detectadas.

--- Amostra das Transações Anômalas (Dados Processados) ---


Unnamed: 0,TransactionAmount,CustomerAge,TransactionDuration,LoginAttempts,AccountBalance,TransactionHour,DayOfWeek,TimeSinceLastTransaction,AmountToBalanceRatio,TransactionType_Debit,Channel_Branch,Channel_Online,CustomerOccupation_Engineer,CustomerOccupation_Retired,CustomerOccupation_Student,anomaly_score
85,3.571903,0.524269,-1.281537,-0.206794,0.907648,0.511423,1.688773,0.807362,-0.088234,-1.85001,-0.726623,1.448244,1.737585,-0.559572,-0.595128,-1
147,0.744656,-0.206534,0.31961,6.431745,-1.203121,1.879574,1.026963,-0.781131,1.993986,0.540538,-0.726623,1.448244,-0.575512,-0.559572,-0.595128,-1
193,-0.058323,-1.105984,-1.367313,-0.206794,-1.236117,0.511423,1.688773,-1.496107,1.478768,-1.85001,-0.726623,1.448244,-0.575512,-0.559572,1.680309,-1
259,0.05237,0.524269,-1.081394,-0.206794,-1.251509,1.879574,1.688773,-0.376983,2.230736,-1.85001,1.37623,-0.690491,1.737585,-0.559572,-0.595128,-1
274,3.010353,0.524269,0.777081,6.431745,-1.22831,-0.856729,0.365154,1.577823,6.710044,-1.85001,-0.726623,-0.690491,1.737585,-0.559572,-0.595128,-1



--- PERFIL ESTATÍSTICO DAS ANOMALIAS ---
(Dados após preprocessing - escalados)


Unnamed: 0,TransactionAmount,CustomerAge,TransactionDuration,LoginAttempts,AccountBalance,TransactionHour,DayOfWeek,TimeSinceLastTransaction,AmountToBalanceRatio,TransactionType_Debit,Channel_Branch,Channel_Online,CustomerOccupation_Engineer,CustomerOccupation_Retired,CustomerOccupation_Student
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,1.520344,-0.026865,0.406788,2.298929,-0.439902,0.269984,0.611711,-0.097117,2.498677,-1.05316,0.262955,0.315972,-0.121963,0.13062,0.252584
std,1.64834,1.132602,1.25502,2.994204,0.997352,1.084414,0.998966,1.070179,3.955297,1.138128,1.06005,1.078138,0.927503,1.07988,1.111083
min,-0.997964,-1.499493,-1.55316,-0.206794,-1.2851,-0.856729,-0.958464,-1.722178,-0.389232,-1.85001,-0.726623,-0.690491,-0.575512,-0.559572,-0.595128
25%,0.185932,-1.105984,-0.681107,-0.206794,-1.212262,-0.856729,-0.296655,-1.096556,-0.19335,-1.85001,-0.726623,-0.690491,-0.575512,-0.559572,-0.595128
50%,1.506969,0.018328,0.176651,-0.206794,-1.01727,0.511423,1.026963,0.017668,0.579084,-1.85001,-0.726623,-0.690491,-0.575512,-0.559572,-0.595128
75%,2.962184,0.945886,1.584803,5.601928,0.261645,0.511423,1.688773,0.778939,3.477082,0.540538,1.37623,1.448244,-0.575512,1.78708,1.680309
max,5.253472,1.985874,2.449708,6.431745,2.504372,1.879574,1.688773,1.587329,15.03814,0.540538,1.37623,1.448244,1.737585,1.78708,1.680309



--- PERFIL ESTATÍSTICO DAS TRANSAÇÕES NORMAIS ---
(Dados após preprocessing - escalados)


Unnamed: 0,TransactionAmount,CustomerAge,TransactionDuration,LoginAttempts,AccountBalance,TransactionHour,DayOfWeek,TimeSinceLastTransaction,AmountToBalanceRatio,TransactionType_Debit,Channel_Branch,Channel_Online,CustomerOccupation_Engineer,CustomerOccupation_Retired,CustomerOccupation_Student
count,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0,2461.0
mean,-0.031507,0.000557,-0.00843,-0.047641,0.009116,-0.005595,-0.012677,0.002013,-0.051781,0.021825,-0.005449,-0.006548,0.002527,-0.002707,-0.005234
std,0.957602,0.997522,0.992785,0.852689,0.998411,0.997845,0.996462,0.99883,0.755675,0.985565,0.998416,0.997697,1.001668,0.998536,0.997345
min,-1.018657,-1.499493,-1.567456,-0.206794,-1.285344,-0.856729,-0.958464,-1.731787,-0.390744,-1.85001,-0.726623,-0.690491,-0.575512,-0.559572,-0.595128
25%,-0.743997,-0.993553,-0.809771,-0.206794,-0.918316,-0.856729,-0.958464,-0.875975,-0.355909,0.540538,-0.726623,-0.690491,-0.575512,-0.559572,-0.595128
50%,-0.311639,0.018328,-0.109268,-0.206794,-0.089694,-0.856729,-0.296655,0.017801,-0.291431,0.540538,-0.726623,-0.690491,-0.575512,-0.559572,-0.595128
75%,0.355534,0.805347,0.591234,-0.206794,0.665012,0.511423,1.026963,0.87381,-0.109174,0.540538,1.37623,1.448244,-0.575512,-0.559572,1.680309
max,5.555266,1.985874,2.578372,6.431745,2.529043,1.879574,1.688773,1.691556,9.448147,0.540538,1.37623,1.448244,1.737585,1.78708,1.680309



--- ANÁLISE DAS DIFERENÇAS ENTRE ANOMALIAS E NORMAIS ---
Features mais discriminativas (ordenadas por diferença absoluta nas médias):
AmountToBalanceRatio: 2.550 (anomalias maior)
LoginAttempts: 2.347 (anomalias maior)
TransactionAmount: 1.552 (anomalias maior)
TransactionType_Debit: -1.075 (normais maior)
DayOfWeek: 0.624 (anomalias maior)
AccountBalance: -0.449 (normais maior)
TransactionDuration: 0.415 (anomalias maior)
Channel_Online: 0.323 (anomalias maior)
TransactionHour: 0.276 (anomalias maior)
Channel_Branch: 0.268 (anomalias maior)
