<a href="https://colab.research.google.com/github/felipecampelo/isolationForestAndOptuna/blob/main/Detec%C3%A7%C3%A3oDeAnomaliasComOptuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**Testes de Detecção de Anomalias**

**Autor**: Felipe Souto Campelo

**Tema**: Detecção com Isolation Forest 


###**Importando as Bibliotecas**

In [None]:
import os
import pickle

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# pip install shap
import shap

# pip install optuna
import optuna

from sklearn import metrics
from sklearn import model_selection
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import OneClassSVM
        
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old aliases, so use whichever name this install provides.
_SEABORN_STYLE = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style=_SEABORN_STYLE)

###**Definindo as Funções**
*   obterAmostra(Porcentagem)
*   stringToNumeric()
*   previsionAndMetrics(modelo, metodo, trainOrTest, X_trainOrTest, y_trainOrTest)
*   mappingPredictions(ypred)



In [None]:
# Default location of the transaction log on the author's Google Drive.
CAMINHO_PADRAO = "/content/drive/MyDrive/TCC Felipe/4- Testes de Detecção/Base de Dados/PS_20174392719_1491204439457_log.csv"


def obterAmostra(Porcentagem, caminho=CAMINHO_PADRAO, random_state=42):
    """Read the transaction CSV and return a sample stratified on ``isFraud``.

    Args:
        Porcentagem: Fraction (0-1) of rows to draw from each ``isFraud`` group,
            so the fraud / non-fraud proportion of the file is kept.
        caminho: Path of the CSV file to read. Defaults to the Drive location
            used by this notebook.
        random_state: Seed for the sampling so that re-running the notebook
            yields the same rows. Pass ``None`` for a different sample each run.

    Returns:
        DataFrame with every original column (``isFraud`` included) holding the
        sampled rows, grouped by ``isFraud`` value.
    """
    df = pd.read_csv(caminho)
    # GroupBy.sample draws the fraction inside every group and keeps the
    # grouping column, without going through groupby.apply.
    return df.groupby('isFraud').sample(frac=Porcentagem, random_state=random_state)

def stringToNumeric(df=None):
    """Return a copy of a DataFrame in which every column is numeric.

    Text columns are label-encoded (missing values become the string "None"
    first); every other column has its missing values replaced by -999.

    Args:
        df: DataFrame to convert. When omitted, the notebook-level
            ``sampled_data`` is used, as before.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    origem = sampled_data if df is None else df
    data = origem.copy()

    for col in data.columns:
        coluna = data[col]
        # Cover both the classic object dtype and pandas' dedicated string dtype.
        if pd.api.types.is_object_dtype(coluna) or pd.api.types.is_string_dtype(coluna):
            textos = coluna.fillna("None").astype(str)
            data[col] = LabelEncoder().fit_transform(textos)
        else:
            # Assign the result back: an in-place fillna on data[col] is a chained
            # operation and does not reach `data` when Copy-on-Write is active.
            data[col] = coluna.fillna(-999)

    return data

def previsionAndMetrics(modelo, metodo, trainOrTest, X_trainOrTest, y_trainOrTest):
    """Predict with a fitted detector, print its metrics and plot its ROC curve.

    Args:
        modelo: Fitted estimator exposing ``predict`` and ``score_samples``.
        metodo: Model name, used only in the printed header.
        trainOrTest: Name of the split being evaluated, used only in the header.
        X_trainOrTest: Feature DataFrame; its ``.values`` array is fed to the model.
        y_trainOrTest: Reference labels the predictions are compared against.

    Returns:
        The predictions after ``mappingPredictions`` (a list of 0/1 values).
    """
    previsoes = mappingPredictions(modelo.predict(X_trainOrTest.values))

    cabecalho = "\nPrecisão do modelo no conjunto de "+trainOrTest+" ("+metodo+"): "
    print(cabecalho, metrics.accuracy_score(y_trainOrTest, previsoes))

    # One F1 line per averaging strategy, in the same order as always.
    relatorio_f1 = (
        ("F1-SCORE BINÁRIO: ", 'binary'),
        ("F1-SCORE MACRO: ", 'macro'),
        ("F1-SCORE MICRO: ", 'micro'),
        ("F1-SCORE WEIGHTED: ", 'weighted'),
    )
    for rotulo, media in relatorio_f1:
        print(rotulo, f1_score(y_trainOrTest, previsoes, average=media))

    # The sample scores are negated before the ROC computation; presumably so that
    # larger values mean "more anomalous", matching label 1 -- confirm against sklearn docs.
    pontuacao = -(modelo.score_samples(X_trainOrTest.values))
    taxa_fp, taxa_vp, _limiares = metrics.roc_curve(y_trainOrTest, pontuacao)
    print("AUC metric: ", metrics.auc(taxa_fp, taxa_vp))
    plt.plot(taxa_fp, taxa_vp)

    return previsoes

def mappingPredictions(ypred):
    """Convert detector outputs to 0/1 labels: -1 becomes 1, anything else becomes 0.

    Args:
        ypred: Iterable of predictions.

    Returns:
        A list with one 0/1 integer per input element.
    """
    convertidos = []
    for rotulo in ypred:
        if rotulo == -1:
            convertidos.append(1)
        else:
            convertidos.append(0)

    return convertidos

###**Obtendo uma amostra dos dados**

In [None]:
# Draw a 0.01 fraction of the transactions and leave out the three columns
# that the rest of the notebook does not use.
sampled_data = obterAmostra(0.01).drop(
    columns=['nameOrig', 'nameDest', 'isFlaggedFraud']
)

###**Transformando String em Numeric Values (IsolationForest exige)**

In [None]:
# Build the all-numeric copy of the sample; called without arguments,
# stringToNumeric() works on the notebook-level `sampled_data`.
data = stringToNumeric() 

###**Divisão dos dados em treino e teste**

In [None]:
# Features are every column except the label; the label is kept as a one-column DataFrame.
X = data.drop(columns='isFraud')
y = data[['isFraud']]

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

###**Aplicação da Isolation Forest e do SVM**



In [None]:
# Tuning the One-Class SVM with Optuna

SVM_MODELS_DIR = './svm_models'
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(SVM_MODELS_DIR, exist_ok=True)


# Step 1. Define an objective function to be maximized.
def objective(trial):
    """Fit a One-Class SVM with the trial's hyperparameters and score it.

    The model is fitted on ``X_train`` without labels; ``y_train`` is used only
    to compute the macro F1 of the resulting predictions on that same split.
    Each fitted model is saved under ``SVM_MODELS_DIR`` so the best one can be
    reloaded after the study.

    Args:
        trial: Optuna trial that supplies the hyperparameter values.

    Returns:
        Macro-averaged F1 score on the training split.
    """
    # Step 2. Setup values for the hyperparameters.
    # Only the RBF kernel is explored; add 'linear', 'poly' or 'sigmoid' to the
    # list to widen the search.
    kernel = trial.suggest_categorical('kernel', ['rbf'])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    gamma = trial.suggest_float('gamma', 1e-7, 1e7, log=True)

    svm = OneClassSVM(kernel=kernel, nu=0.01, gamma=gamma)

    # Fit and predict on the same ndarray: predicting on the DataFrame after
    # fitting on .values triggers sklearn's feature-name warning on every trial.
    X_array = X_train.values
    svm.fit(X_array)
    y_pred = mappingPredictions(svm.predict(X_array))

    # Saving the model
    joblib.dump(svm, f'{SVM_MODELS_DIR}/model_{trial.number}.joblib')

    # Step 3: Scoring method (macro F1, not accuracy).
    return f1_score(y_train, y_pred, average='macro')


# Step 4: Running it. The sampler is seeded so the search is repeatable.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

# Getting the best trial
print(f"The best trial is : {study.best_trial}")

# Getting the best score:
print(f"The best value is : {study.best_value}")

# Getting the best parameters:
print(f"The best parameters are : {study.best_params}")

# Loading the best model
model_svm = joblib.load(f'{SVM_MODELS_DIR}/model_{study.best_trial.number}.joblib')

[32m[I 2022-06-22 04:14:44,099][0m A new study created in memory with name: no-name-e1f6da1f-7407-4031-8f37-efb671f47dd0[0m
X has feature names, but OneClassSVM was fitted without feature names
[32m[I 2022-06-22 04:21:20,992][0m Trial 0 finished with value: 0.21978931143278793 and parameters: {'kernel': 'rbf', 'gamma': 0.06679124765725758}. Best is trial 0 with value: 0.21978931143278793.[0m
X has feature names, but OneClassSVM was fitted without feature names
[32m[I 2022-06-22 04:25:20,987][0m Trial 1 finished with value: 0.3355710862051591 and parameters: {'kernel': 'rbf', 'gamma': 1.3437842761934014e-07}. Best is trial 1 with value: 0.3355710862051591.[0m
X has feature names, but OneClassSVM was fitted without feature names
[32m[I 2022-06-22 04:29:14,969][0m Trial 2 finished with value: 0.3606965550892548 and parameters: {'kernel': 'rbf', 'gamma': 1.2757816456291948e-07}. Best is trial 2 with value: 0.3606965550892548.[0m
X has feature names, but OneClassSVM was fitted w

In [None]:
# Tuning the Isolation Forest with Optuna

IFOREST_MODELS_DIR = './iforest_models'
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(IFOREST_MODELS_DIR, exist_ok=True)


# Step 1. Define an objective function to be maximized.
def objective(trial):
    """Fit an Isolation Forest with the trial's hyperparameters and score it.

    The model is fitted on ``X_train`` without labels; ``y_train`` is used only
    to compute the macro F1 of the resulting predictions on that same split.
    Each fitted model is saved under ``IFOREST_MODELS_DIR`` so the best one can
    be reloaded after the study.

    Args:
        trial: Optuna trial that supplies the hyperparameter values.

    Returns:
        Macro-averaged F1 score on the training split.
    """
    # Step 2. Setup values for the hyperparameters.
    # max_samples and max_features stay at their defaults and contamination is
    # fixed at 0.01; only the two settings below are searched.
    n_estimators = trial.suggest_int('n_estimators', 1, 100)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # n_jobs and verbose only control parallelism and logging, so they are fixed
    # instead of being tuned. random_state makes a trial's score depend on its
    # hyperparameters rather than on the forest's random draws.
    iforest = IsolationForest(n_estimators=n_estimators, contamination=0.01,
                              bootstrap=bootstrap, n_jobs=-1, random_state=42)

    # Fit and predict on the same ndarray: predicting on the DataFrame after
    # fitting on .values triggers sklearn's feature-name warning on every trial.
    X_array = X_train.values
    iforest.fit(X_array)
    y_pred = mappingPredictions(iforest.predict(X_array))

    # Saving the model
    joblib.dump(iforest, f'{IFOREST_MODELS_DIR}/model_{trial.number}.joblib')

    # Step 3: Scoring method (macro F1, not accuracy).
    return f1_score(y_train, y_pred, average='macro')


# Step 4: Running it. The sampler is seeded so the search is repeatable.
# NOTE(review): the search space above has at most 200 distinct combinations,
# so 1000 trials will revisit configurations -- consider lowering n_trials.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=1000)

# Getting the best trial
print(f"The best trial is : {study.best_trial}")

# Getting the best score:
print(f"The best value is : {study.best_value}")

# Getting the best parameters:
print(f"The best parameters are : {study.best_params}")

# Loading the best model
model_iforest = joblib.load(f'{IFOREST_MODELS_DIR}/model_{study.best_trial.number}.joblib')

###**Previsão e Métricas para IsolationForest e OneClassSVM**

In [None]:
# Evaluate both tuned detectors on the training and test splits.
y_pred_train = previsionAndMetrics(model_iforest, 'IsolationForest', 'treino', X_train, y_train)
y_pred = previsionAndMetrics(model_iforest, 'IsolationForest', 'teste', X_test, y_test)
y_pred_train_SVM = previsionAndMetrics(model_svm, 'OneClassSVM', 'treino', X_train, y_train)
y_pred_SVM = previsionAndMetrics(model_svm, 'OneClassSVM', 'teste', X_test, y_test)

# Each call above draws its ROC curve on the current axes without a label, so
# the four lines are named here, in call order, to make the figure readable.
plt.legend(
    plt.gca().get_lines(),
    ['IsolationForest (treino)', 'IsolationForest (teste)',
     'OneClassSVM (treino)', 'OneClassSVM (teste)'],
)
plt.xlabel('Taxa de falsos positivos')
plt.ylabel('Taxa de verdadeiros positivos')
plt.title('Curvas ROC')
plt.show()

###**Plotando a Matriz de Confusão**

In [None]:
# Confusion matrix of the One-Class SVM predictions on the test split.
# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns = predicted
# class), which the four names and the reshape below rely on.
cf_matrix = confusion_matrix(y_test, y_pred_SVM, labels=[0, 1])
group_names = ['Verdadeiro Negativo','Falso Positivo','Falso Negativo','Verdadeiro Positivo']
group_counts = [f"{value:0.0f}" for value in cf_matrix.flatten()]

# One annotation per cell: the cell's name above its count.
labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_counts)]
labels = np.asarray(labels).reshape(2,2)

ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')
ax.set(xlabel='Classe prevista', ylabel='Classe real',
       title='Matriz de confusão - OneClassSVM (teste)')
plt.show()

# Wrap the Isolation Forest test predictions (y_pred) in a DataFrame.
y_pred = pd.DataFrame(y_pred)

###**SHAP Values para Isolation Forest**


1.   Criando o explainer e calculando os valores SHAP
2.   Análise dos valores SHAP para a primeira linha da previsão

In [None]:
shap.initjs()

explainer = shap.TreeExplainer(model_iforest) # Build the explainer used to compute SHAP values for the tuned Isolation Forest
shap_values = explainer.shap_values(X_test) # SHAP values computed from the test-split features

print("\n#####  Análise dos valores SHAP  #####")
# NOTE(review): shap_values[1] takes index 1 of the result; if shap_values is a (rows, features)
# array this is the second test row, although the section text speaks of the first row -- confirm.
# X_test.columns is passed in the third positional slot; presumably SHAP uses it as feature labels -- TODO confirm.
shap.force_plot(explainer.expected_value, shap_values[1], X_test.columns, matplotlib = True, show = False) # Force plot for the selected entry, drawn with matplotlib

3.   Tabela de valores SHAP associados a cada classe em porcentagem

In [None]:
print("\n#####  Tabela de valores SHAP associados a cada classe  #####")
# Pair each feature name with the matching entry of shap_values[1] and print one line per feature.
# NOTE(review): the printed number is 100*(100*round(v, 2)/50), i.e. about 200 x the rounded SHAP
# value, shown with a '%' sign; the reason for the /50 scaling is not visible here -- confirm.
for col, vShap in zip(X_test.columns, shap_values[1]): # Print the contribution of each feature to the prediction
    print("===================================================")
    print(col, 'tem valor SHAP associado de: ', 100*(100*vShap.round(2)/50).round(2),'%')

4.   Importância de cada classe para a previsão 


In [None]:
print("\n#####  Importância de cada classe para a previsão  #####")
shap.summary_plot(shap_values, X_test, plot_type="bar") # Bar summary: importance of each feature for the prediction result

print("\n#####  Análise do resultado  #####")
# Same SHAP values and features, drawn with plot_type="dot".
shap.summary_plot(shap_values, X_test, plot_type="dot") 

5.   Contribuição de cada classe para a primeira linha de previsão

In [None]:
print("\n#####  Contribuição de cada classe para a primeira linha de dados  #####")
# Waterfall plot built from element 0 of explainer.expected_value and the flattened shap_values[1].
# NOTE(review): index 1 is used although the heading says "first row"; also, waterfall_legacy is
# reached through the underscore-prefixed shap.plots._waterfall module -- confirm both are intended.
shap.plots._waterfall.waterfall_legacy(expected_value=explainer.expected_value[0], shap_values=shap_values[1].reshape(-1), feature_names = X_test.columns, show=True)

###**SHAP Values para One Class SVM**


1.   Criando o explainer e calculando os valores SHAP (30min)
2.   Análise dos valores SHAP para a primeira linha da previsão

In [None]:
shap.initjs()

# Summarise the test split with shap.kmeans (second argument 50) to speed the process up
X_test_summary = shap.kmeans(X_test, 50)

# From here on `explainer` and `shap_values` are rebound to the One-Class SVM results,
# replacing the Isolation Forest ones created earlier in the notebook.
explainer = shap.KernelExplainer(model_svm.predict, X_test_summary) # Build the explainer around model_svm.predict, with the summary as background data
shap_values = explainer.shap_values(X_test) # SHAP values computed from the test-split features

print("\n#####  Análise dos valores SHAP  #####")
# NOTE(review): shap_values[1] takes index 1 of the result; if shap_values is a (rows, features)
# array this is the second test row, although the section text speaks of the first row -- confirm.
shap.force_plot(explainer.expected_value, shap_values[1], X_test.columns, matplotlib = True, show = False) # Force plot for the selected entry, drawn with matplotlib

3.   Tabela de valores SHAP associados a cada classe em porcentagem

In [None]:
print("\n#####  Tabela de valores SHAP associados a cada classe  #####")
# Pair each feature name with the matching entry of shap_values[1] and print one line per feature.
# NOTE(review): the printed number is 100*(100*round(v, 2)/50), i.e. about 200 x the rounded SHAP
# value, shown with a '%' sign; the reason for the /50 scaling is not visible here -- confirm.
for col, vShap in zip(X_test.columns, shap_values[1]): # Print the contribution of each feature to the prediction
    print("===================================================")
    print(col, 'tem valor SHAP associado de: ', 100*(100*vShap.round(2)/50).round(2),'%')

4.   Importância de cada classe para a previsão 

In [None]:
print("\n#####  Importância de cada classe para a previsão  #####")
shap.summary_plot(shap_values, X_test, plot_type="bar") # Bar summary: importance of each feature for the prediction result

print("\n#####  Análise do resultado  #####")
# Same SHAP values and features, drawn with plot_type="dot".
shap.summary_plot(shap_values, X_test, plot_type="dot") 

5.   Contribuição de cada classe para a primeira linha de previsão

In [None]:
print("\n#####  Contribuição de cada classe para a primeira linha de dados  #####")
# Waterfall plot built from explainer.expected_value (passed whole, not indexed) and the flattened shap_values[1].
# NOTE(review): index 1 is used although the heading says "first row"; also, waterfall_legacy is
# reached through the underscore-prefixed shap.plots._waterfall module -- confirm both are intended.
shap.plots._waterfall.waterfall_legacy(expected_value=explainer.expected_value, shap_values=shap_values[1].reshape(-1), feature_names = X_test.columns, show=True)