In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.metrics import make_scorer, roc_auc_score,confusion_matrix,f1_score,accuracy_score,precision_score,recall_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#The model
from sklearn.ensemble import IsolationForest

#For preprocessing
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# from tqdm import tqdm
from tqdm.notebook import tqdm
import warnings 
warnings.simplefilter("ignore")

### Preprocessing

In [2]:
data=pd.read_csv("/home/dah/anomalie_detection/anomalie_detection/data/creditcard.csv")


In [3]:

data=data.copy(deep=True)
data.drop_duplicates(keep="last",inplace=True)
target=data.Class
data=data.drop(["Class","Time"],axis=1)

In [4]:
scaler = MinMaxScaler()
data_scaled_array = scaler.fit_transform(data)

data = pd.DataFrame(data_scaled_array, columns=data.columns)

### Preparation des données d'entraînement et de test

In [5]:
xtrain,xtest,ytrain,ytest=train_test_split(data,target,test_size=0.15,random_state=42,shuffle=True)

### Définition du modèle

In [31]:
model=IsolationForest(contamination=0.01667,bootstrap=False,n_estimators=500,n_jobs=-1,verbose=1)

### Entraînement du modèle

In [32]:
model.fit(xtrain,ytrain)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    1.1s remaining:    7.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    1.1s finished


### Prédiction du modèle

In [33]:
ypred=model.predict(xtest)

### Transformation des targets
ypred return un vecteur contenant que de -1 pour des données anormales et 1 pour les données normales alors que les targets des <br>
des données initial contiennet des 0 pour des données normales et 1 pour des données anormales. Il faut donc transformer ce vecteur <br>
en un vecteur contenant des 1 pour des données normales et -1 pour les anomalies

In [34]:
def transform_target(target):
    n=target.shape[0]
    for i in range(n):
        if target.iloc[i]==1:
            target.iloc[i]=-1
        else:
            target.iloc[i]=1
    return target


def inverse_transform_target(target):
    n=target.shape[0]
    for i in range(n):
        if target.iloc[i]==-1:
            target.iloc[i]=1
        else:
            target.iloc[i]=0
    return target

### Matrice de confusion

In [35]:
ytest=transform_target(ytest)
confusion=confusion_matrix(ytest,ypred)
print(confusion)

[[  656 41835]
 [   45    23]]


In [36]:
xtest.shape

(42559, 29)

In [38]:
xtest.unique()

AttributeError: 'DataFrame' object has no attribute 'unique'

### Recherche des hypermaramètres du modèle

In [139]:
parameters={
    "n_estimators":[50,100,150,200,250,300,350,400,450,500],
    "contamination":[0.1,0.15,0.175,0.2],
    "max_samples":[0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    "max_features":[0.5,0.6,0.7,0.8,0.9,1.0],
    # "bootstrap":[False,True]
}

In [140]:
scoring_metrics = {
    'roc_auc': make_scorer(roc_auc_score),
    'f1_score': make_scorer(f1_score),
    'accuracy_score':make_scorer(accuracy_score)
    # 'precision_score':make_scorer(precision_score),
    # 'recall_score':make_scorer(recall_score)
}

In [141]:
gridsearch=GridSearchCV(estimator=IsolationForest(random_state=42),param_grid=parameters,verbose=3,scoring=scoring_metrics,refit='roc_auc')
gridsearch

In [142]:
ytrain=transform_target(ytrain)

In [None]:
help(GridSearchCV)

In [143]:
n=len(parameters['n_estimators']) * len(parameters['contamination']) * len(parameters['bootstrap']) * len(parameters['max_samples']) * len(parameters['max_features'])
n

KeyError: 'bootstrap'

In [144]:
gridsearch.fit(xtrain, ytrain)

Fitting 5 folds for each of 1680 candidates, totalling 8400 fits
[CV 1/5] END contamination=0.1, max_features=0.5, max_samples=0.4, n_estimators=50; accuracy_score: (test=0.898) f1_score: (test=0.946) roc_auc: (test=0.894) total time=   1.3s
[CV 2/5] END contamination=0.1, max_features=0.5, max_samples=0.4, n_estimators=50; accuracy_score: (test=0.902) f1_score: (test=0.949) roc_auc: (test=0.879) total time=   1.2s
[CV 3/5] END contamination=0.1, max_features=0.5, max_samples=0.4, n_estimators=50; accuracy_score: (test=0.901) f1_score: (test=0.948) roc_auc: (test=0.915) total time=   1.3s
[CV 4/5] END contamination=0.1, max_features=0.5, max_samples=0.4, n_estimators=50; accuracy_score: (test=0.901) f1_score: (test=0.948) roc_auc: (test=0.912) total time=   1.2s
[CV 5/5] END contamination=0.1, max_features=0.5, max_samples=0.4, n_estimators=50; accuracy_score: (test=0.901) f1_score: (test=0.948) roc_auc: (test=0.904) total time=   1.3s
[CV 1/5] END contamination=0.1, max_features=0.5, 

KeyboardInterrupt: 