In [1]:
# importations
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

import mlflow
from pyngrok import ngrok

In [2]:
# Load the dataset from a CSV file (the path is relative, so it depends on the working directory)
data = pd.read_csv('Outputs/data.csv')

In [3]:
# Count the missing values in each column
data.isnull().sum()

lib_etendu       0
bilan            0
montant_signe    0
signe            0
num_oper         0
diff_ope_val     0
dtype: int64

# On va maintenant creer un pipeline pour le preprocessing des données et le model

In [7]:
# Split the columns of the dataset into numerical and categorical features

# Numerical columns: every column whose dtype is not `object`
numerical_features = data.select_dtypes(exclude=['object']).columns

# Categorical columns: every column whose dtype is `object`
categorical_features = data.select_dtypes(include=['object']).columns

In [8]:
# Generic factory: builds pipelines that differ only by their scaler and their model

def create_pipeline(model = None, transform_for_num = None, features = None):
    """Build an unfitted preprocessing + model pipeline.

    Parameters
    ----------
    model : estimator or None
        Final step of the pipeline (registered under the name 'model').
    transform_for_num : transformer or None
        Step applied to the numerical columns right after imputation
        (typically a scaler).
    features : list-like of column names or None
        Columns routed through the numerical branch. When None, the
        notebook-level `numerical_features` is used, which is what the
        function always did before this parameter existed.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with two steps: 'preprocessor' (a ColumnTransformer holding
        the numerical branch only) and 'model'.
    """
    # Fall back on the notebook global so existing calls keep working unchanged
    if features is None:
        features = numerical_features

    # Numerical branch: fill missing values with the most frequent one, then transform
    numerical_pipeline = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        transform_for_num
    )

    # Categorical columns are deliberately left out: only the 'num' branch is
    # registered, so every column not listed in `features` is not passed to the model.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_pipeline, features)
        ]
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )
    return pipeline

# On va maintenant creer des experiences avec MLflow afin de determiner le meilleur model

On va etudier differents models :
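- Local Outlier Factor (LOF) — implémenté plus bas avec `LocalOutlierFactor` de scikit-learn (et non avec CBLOF, Clustering-Based Local Outlier Factor)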
- Isolation Forest
- One-Class SVM
- K-Means
- Autoencoder

Nous utiliserons un mode d'apprentissage non supervisee pour detecter les anomalies. Donc pour les metrics d'evaluation, nous allons utiliser :
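- silhouette_score : pour chaque point, on compare la distance moyenne aux autres points de son propre cluster (a) à la distance moyenne aux points du cluster voisin le plus proche (b), avec s = (b - a) / max(a, b). Le score global est la moyenne de s sur tous les points et varie entre -1 et 1 : une valeur proche de 1 indique des clusters bien séparés, une valeur proche de 0 des clusters qui se chevauchent, et une valeur négative des points probablement affectés au mauvais cluster.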
- calinski_harabasz_score :  mesure la séparation entre les clusters. Plus la variance inter-cluster est grande par rapport à la variance intra-cluster, plus le score est élevé, ce qui indique que les clusters sont bien séparés.
- davies_bouldin_score : est calculé en mesurant la distance entre chaque paire de clusters et en comparant cette distance à la somme des rayons des deux clusters. Un score plus faible indique que les clusters sont plus compacts et séparés.

In [9]:
import warnings
# Silence every warning raised from this point on.
# NOTE(review): this filter is not restricted to a category or module, so it hides all warnings — consider narrowing it.
warnings.filterwarnings('ignore')


# On va commencer par le model K-Means

In [10]:
# Turn on MLflow autologging for scikit-learn
mlflow.sklearn.autolog()

In [11]:
# Select the MLflow experiment that will hold the K-Means runs

mlflow.set_experiment("K-Means_experiment")

<Experiment: artifact_location='file:///D:/Documents/Stage%20PFE/Projet/Codes/BenchmarkCodes/mlruns/529053560302357396', creation_time=1681993442450, experiment_id='529053560302357396', last_update_time=1681993442450, lifecycle_stage='active', name='K-Means_experiment', tags={}>

In [7]:
with mlflow.start_run(run_name="default_K-Means_with_StandardScaler") as run:
    # Build the pipeline: StandardScaler on the numerical columns, then K-Means.
    # random_state is pinned because K-Means initialisation is random; without it
    # two executions of this cell can log different scores.
    pipeline = create_pipeline(model = KMeans(n_clusters=2, random_state=42), transform_for_num = StandardScaler())

    # Fit the pipeline on the full dataset
    pipeline.fit(data)

    # Cluster label assigned to every row
    predictions = pipeline.predict(data)

    # Log the evaluation metrics of this run.
    # NOTE(review): the three sklearn scores are computed on the raw `data`, while
    # inertia comes from the model fitted on the scaled features — confirm this is intended.
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
    mlflow.log_metric("inertia", pipeline['model'].inertia_)
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

# No explicit mlflow.end_run() here: leaving the `with` block already closes the run.

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`


In [8]:
with mlflow.start_run(run_name="default_K-Means_with_MinMaxScaler") as run:
    # Build the pipeline: MinMaxScaler on the numerical columns, then K-Means.
    # random_state is pinned because K-Means initialisation is random; without it
    # two executions of this cell can log different scores.
    pipeline = create_pipeline(model = KMeans(n_clusters=2, random_state=42), transform_for_num = MinMaxScaler())

    # Fit the pipeline on the full dataset
    pipeline.fit(data)

    # Cluster label assigned to every row
    predictions = pipeline.predict(data)

    # Log the evaluation metrics of this run.
    # NOTE(review): the three sklearn scores are computed on the raw `data`, while
    # inertia comes from the model fitted on the scaled features — confirm this is intended.
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
    mlflow.log_metric("inertia", pipeline['model'].inertia_)
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

# No explicit mlflow.end_run() here: leaving the `with` block already closes the run.

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`


In [9]:
with mlflow.start_run(run_name="default_K-Means_with_RobustScaler") as run:
    # Build the pipeline: RobustScaler on the numerical columns, then K-Means.
    # random_state is pinned because K-Means initialisation is random; without it
    # two executions of this cell can log different scores.
    pipeline = create_pipeline(model = KMeans(n_clusters=2, random_state=42), transform_for_num = RobustScaler())

    # Fit the pipeline on the full dataset
    pipeline.fit(data)

    # Cluster label assigned to every row
    predictions = pipeline.predict(data)

    # Log the evaluation metrics of this run.
    # NOTE(review): the three sklearn scores are computed on the raw `data`, while
    # inertia comes from the model fitted on the scaled features — confirm this is intended.
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
    mlflow.log_metric("inertia", pipeline['model'].inertia_)
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

# No explicit mlflow.end_run() here: leaving the `with` block already closes the run.

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`


Le StandardScaler donne les meilleurs resultats donc on va continuer avec ce scaler

In [12]:
# Hyper-parameter grid explored for K-Means (one MLflow run per combination)
inits = ['k-means++', 'random']
max_iters = [200, 500, 1000]
algorithms = ['lloyd', 'elkan']
tolerances = [1e-4, 1e-3, 1e-2, 1e-1]

for init in inits:
    for max_iter in max_iters:
        for algorithm in algorithms:
            for tolerance in tolerances:
                with mlflow.start_run(run_name=f"K-Means_with_{init}_init_{max_iter}_max_iter_{algorithm}_algorithm_{tolerance}_tolerance") as run:
                    # StandardScaler + K-Means with the current combination.
                    # random_state is pinned so that differences between runs come from
                    # the hyper-parameters and not from the random initialisation.
                    pipeline = create_pipeline(
                        model = KMeans(n_clusters=2, init=init, max_iter=max_iter, algorithm=algorithm, tol=tolerance, random_state=42),
                        transform_for_num = StandardScaler()
                    )

                    # Fit the pipeline on the full dataset
                    pipeline.fit(data)

                    # Cluster label assigned to every row
                    predictions = pipeline.predict(data)

                    # Log the evaluation metrics; inertia is logged as well so these runs
                    # expose the same metrics as the default K-Means runs.
                    # NOTE(review): the three sklearn scores are computed on the raw `data`,
                    # not on the scaled features the model was fitted on — confirm intended.
                    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
                    mlflow.log_metric("inertia", pipeline['model'].inertia_)
                    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
                    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

                # No explicit mlflow.end_run(): leaving the `with` block already closes the run.

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`
                                 Pipeline(steps=[('simpleimputer',
                                             

# Conclusion K-Means :
Le meilleur model est K-Means avec :
- init = k-means++
- max_iter = 500
- algorithm = lloyd
- tolerance = 0.01

# On passe au Isolation Forest

In [13]:
# Select the MLflow experiment that will hold the Isolation Forest runs

mlflow.set_experiment("Iforest_experiment")

<Experiment: artifact_location='file:///D:/Documents/Stage%20PFE/Projet/Codes/BenchmarkCodes/mlruns/417521680502521761', creation_time=1682022805315, experiment_id='417521680502521761', last_update_time=1682022805315, lifecycle_stage='active', name='Iforest_experiment', tags={}>

In [35]:
with mlflow.start_run(run_name="default_Iforest_with_StandardScaler") as run:
    # Build the pipeline: StandardScaler on the numerical columns, then Isolation Forest.
    # random_state is pinned because the forest is built from random sub-samples and
    # random splits; without it two executions of this cell can log different scores.
    pipeline = create_pipeline(model = IsolationForest(random_state=42), transform_for_num = StandardScaler())

    # Fit the pipeline on the full dataset
    pipeline.fit(data)

    # Label predicted for every row
    predictions = pipeline.predict(data)

    # Log the evaluation metrics of this run.
    # NOTE(review): the scores are computed on the raw `data`, not on the scaled
    # features the model was fitted on — confirm this is intended.
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`


In [36]:
with mlflow.start_run(run_name="default_Iforest_with_MinMaxScaler") as run:
    # Build the pipeline: MinMaxScaler on the numerical columns, then Isolation Forest.
    # random_state is pinned because the forest is built from random sub-samples and
    # random splits; without it two executions of this cell can log different scores.
    pipeline = create_pipeline(model = IsolationForest(random_state=42), transform_for_num = MinMaxScaler())

    # Fit the pipeline on the full dataset
    pipeline.fit(data)

    # Label predicted for every row
    predictions = pipeline.predict(data)

    # Log the evaluation metrics of this run.
    # NOTE(review): the scores are computed on the raw `data`, not on the scaled
    # features the model was fitted on — confirm this is intended.
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`


In [37]:
with mlflow.start_run(run_name="default_Iforest_with_RobustScaler") as run:
    # Build the pipeline: RobustScaler on the numerical columns, then Isolation Forest.
    # random_state is pinned because the forest is built from random sub-samples and
    # random splits; without it two executions of this cell can log different scores.
    pipeline = create_pipeline(model = IsolationForest(random_state=42), transform_for_num = RobustScaler())

    # Fit the pipeline on the full dataset
    pipeline.fit(data)

    # Label predicted for every row
    predictions = pipeline.predict(data)

    # Log the evaluation metrics of this run.
    # NOTE(review): the scores are computed on the raw `data`, not on the scaled
    # features the model was fitted on — confirm this is intended.
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`


L'iForest par défaut avec MinMaxScaler donne les meilleurs résultats. On va donc continuer uniquement avec ce scaler.

In [14]:
# Hyper-parameter grid explored for Isolation Forest (one MLflow run per combination)
n_estimators = [200, 300, 400, 500, 1000]
max_samples = ['auto', 0.1, 0.01, 0.05, 0.001]
contaminations = ['auto', 0.1, 0.01, 0.05]

for n_estimator in n_estimators:
    for max_sample in max_samples:
        for contamination in contaminations:
            with mlflow.start_run(run_name=f"IsolationForest_with_{n_estimator}_n_estimator_{max_sample}_max_sample_{contamination}_contamination") as run:
                # MinMaxScaler + Isolation Forest with the current combination.
                # random_state is pinned so that differences between runs come from
                # the hyper-parameters and not from the random construction of the trees.
                pipeline = create_pipeline(
                    model = IsolationForest(n_estimators=n_estimator, max_samples=max_sample, contamination=contamination, random_state=42),
                    transform_for_num = MinMaxScaler()
                )

                # Fit the pipeline on the full dataset
                pipeline.fit(data)

                # Label predicted for every row
                predictions = pipeline.predict(data)

                # Log the evaluation metrics of this run.
                # NOTE(review): the scores are computed on the raw `data`, not on the
                # scaled features the model was fitted on — confirm this is intended.
                mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
                mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
                mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

            # No explicit mlflow.end_run(): leaving the `with` block already closes the run.

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`
                                 Pipeline(steps=[('simpleimputer',
                                             

# Conclusion IsolationForest
Le meilleur model de l'IForest est avec :
- n_estimators = 300
- max_samples = 0.001
- contamination = 0.01

# On passe au LOF (Local Outlier Factor)

In [11]:
# Select the MLflow experiment that will hold the Local Outlier Factor runs
# (the experiment is named "CBLOF_experiment", but the runs below use sklearn's LocalOutlierFactor)

mlflow.set_experiment("CBLOF_experiment")

<Experiment: artifact_location='file:///D:/Documents/Stage%20PFE/Projet/Codes/BenchmarkCodes/mlruns/714955314991463918', creation_time=1682024738587, experiment_id='714955314991463918', last_update_time=1682024738587, lifecycle_stage='active', name='CBLOF_experiment', tags={}>

In [59]:
with mlflow.start_run(run_name="default_LOF_with_StandardScaler") as run:
    # Pipeline: StandardScaler on the numerical columns, followed by LocalOutlierFactor
    lof_model = LocalOutlierFactor()
    pipeline = create_pipeline(model = lof_model, transform_for_num = StandardScaler())

    # Fit the pipeline and get the label of every row in a single call
    predictions = pipeline.fit_predict(data)

    # Log the three scores, one after the other, under their sklearn names
    for metric_name, score_fn in (
        ("silhouette_score", metrics.silhouette_score),
        ("calinski_harabasz_score", metrics.calinski_harabasz_score),
        ("davies_bouldin_score", metrics.davies_bouldin_score),
    ):
        mlflow.log_metric(metric_name, score_fn(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`


In [60]:
with mlflow.start_run(run_name="default_LOF_with_MinMaxScaler") as run:
    # Pipeline: MinMaxScaler on the numerical columns, followed by LocalOutlierFactor
    lof_model = LocalOutlierFactor()
    pipeline = create_pipeline(model = lof_model, transform_for_num = MinMaxScaler())

    # Fit the pipeline and get the label of every row in a single call
    predictions = pipeline.fit_predict(data)

    # Log the three scores, one after the other, under their sklearn names
    for metric_name, score_fn in (
        ("silhouette_score", metrics.silhouette_score),
        ("calinski_harabasz_score", metrics.calinski_harabasz_score),
        ("davies_bouldin_score", metrics.davies_bouldin_score),
    ):
        mlflow.log_metric(metric_name, score_fn(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`


In [61]:
with mlflow.start_run(run_name="default_LOF_with_RobustScaler") as run:
    # Pipeline: RobustScaler on the numerical columns, followed by LocalOutlierFactor
    lof_model = LocalOutlierFactor()
    pipeline = create_pipeline(model = lof_model, transform_for_num = RobustScaler())

    # Fit the pipeline and get the label of every row in a single call
    predictions = pipeline.fit_predict(data)

    # Log the three scores, one after the other, under their sklearn names
    for metric_name, score_fn in (
        ("silhouette_score", metrics.silhouette_score),
        ("calinski_harabasz_score", metrics.calinski_harabasz_score),
        ("davies_bouldin_score", metrics.davies_bouldin_score),
    ):
        mlflow.log_metric(metric_name, score_fn(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`


Pour le LOF, le meilleur scaler est le RobustScaler. On va donc continuer avec ce scaler

In [14]:
# Hyper-parameter grid explored for Local Outlier Factor (one MLflow run per combination)
n_neighbors = [20, 25, 30, 35, 40]
algorithms = ['kd_tree', 'brute', 'auto', 'ball_tree']
leaf_sizes = [20, 30, 40, 50]
contaminations = ['auto', 0.05, 0.1, 0.001]

# NOTE(review): `algorithm` and `leaf_size` configure how the neighbours are searched;
# presumably they affect speed and memory rather than which rows are flagged — confirm
# before reading anything into the "best" algorithm / leaf_size.
for n_neighbor in n_neighbors:
    for algorithm in algorithms:
        for leaf_size in leaf_sizes:
            for contamination in contaminations:
                with mlflow.start_run(run_name=f"LOF_with_{n_neighbor}_n_neighbor_{algorithm}_algorithm_{leaf_size}_leaf_size_{contamination}_contamination") as run:
                    # RobustScaler + LocalOutlierFactor with the current combination
                    pipeline = create_pipeline(
                        model = LocalOutlierFactor(n_neighbors=n_neighbor, algorithm=algorithm, leaf_size=leaf_size, contamination=contamination),
                        transform_for_num = RobustScaler()
                    )

                    # Fit the pipeline and get the label of every row in a single call
                    predictions = pipeline.fit_predict(data)

                    # Log the evaluation metrics of this run.
                    # NOTE(review): the scores are computed on the raw `data`, not on the
                    # scaled features the model was fitted on — confirm this is intended.
                    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
                    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
                    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

                # No explicit mlflow.end_run(): leaving the `with` block already closes the run.

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`
                                 Pipeline(steps=[('simpleimputer',
                                             

# Conclusion LOF
Le meilleur model est avec :
- n_neighbors = 40
- algorithm = kd_tree
- leaf_size = 50
- contamination = 0.05

# On passe au One-Class SVM

In [11]:
# Select the MLflow experiment that will hold the One-Class SVM runs

mlflow.set_experiment("OneClassSVM_experiment")

<Experiment: artifact_location='file:///D:/Documents/Stage%20PFE/Projet/Codes/BenchmarkCodes/mlruns/701905565425256705', creation_time=1682039896310, experiment_id='701905565425256705', last_update_time=1682039896310, lifecycle_stage='active', name='OneClassSVM_experiment', tags={}>

In [14]:
with mlflow.start_run(run_name="default_OneClassSVM_with_StandardScaler") as run:
    # Pipeline: StandardScaler on the numerical columns, followed by a default One-Class SVM
    svm_model = OneClassSVM()
    pipeline = create_pipeline(model = svm_model, transform_for_num = StandardScaler())

    # Fit on the full dataset, then label every row
    pipeline.fit(data)
    predictions = pipeline.predict(data)

    # Log the three scores, one after the other, under their sklearn names
    for metric_name, score_fn in (
        ("silhouette_score", metrics.silhouette_score),
        ("calinski_harabasz_score", metrics.calinski_harabasz_score),
        ("davies_bouldin_score", metrics.davies_bouldin_score),
    ):
        mlflow.log_metric(metric_name, score_fn(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`


In [83]:
with mlflow.start_run(run_name="default_OneClassSVM_with_MinMaxScaler") as run:
    # Pipeline: MinMaxScaler on the numerical columns, followed by a default One-Class SVM
    svm_model = OneClassSVM()
    pipeline = create_pipeline(model = svm_model, transform_for_num = MinMaxScaler())

    # Fit on the full dataset, then label every row
    pipeline.fit(data)
    predictions = pipeline.predict(data)

    # Log the three scores, one after the other, under their sklearn names
    for metric_name, score_fn in (
        ("silhouette_score", metrics.silhouette_score),
        ("calinski_harabasz_score", metrics.calinski_harabasz_score),
        ("davies_bouldin_score", metrics.davies_bouldin_score),
    ):
        mlflow.log_metric(metric_name, score_fn(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`


In [12]:
with mlflow.start_run(run_name="default_OneClassSVM_with_RobustScaler2") as run:
    # Pipeline: RobustScaler on the numerical columns, followed by a default One-Class SVM
    svm_model = OneClassSVM()
    pipeline = create_pipeline(model = svm_model, transform_for_num = RobustScaler())

    # Fit on the full dataset, then label every row
    pipeline.fit(data)
    predictions = pipeline.predict(data)

    # Log the three scores, one after the other, under their sklearn names
    for metric_name, score_fn in (
        ("silhouette_score", metrics.silhouette_score),
        ("calinski_harabasz_score", metrics.calinski_harabasz_score),
        ("davies_bouldin_score", metrics.davies_bouldin_score),
    ):
        mlflow.log_metric(metric_name, score_fn(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`


On va continuer avec StandardScaler

In [15]:
# Hyper-parameter grid explored for One-Class SVM (one MLflow run per combination)
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
gammas = ['scale', 'auto']
nus = [0.1, 0.3, 0.5, 0.7, 0.9]

for kernel in kernels:
    for gamma in gammas:
        for nu in nus:
            run_label = f"kernel_{kernel}_gamma_{gamma}_nu_{nu}_with_StandardScaler"
            with mlflow.start_run(run_name=run_label) as run:
                # Pipeline: StandardScaler followed by a One-Class SVM with the current combination
                svm_model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
                pipeline = create_pipeline(model = svm_model, transform_for_num = StandardScaler())

                # Fit on the full dataset, then label every row
                pipeline.fit(data)
                predictions = pipeline.predict(data)

                # Log the three scores, one after the other, under their sklearn names
                for metric_name, score_fn in (
                    ("silhouette_score", metrics.silhouette_score),
                    ("calinski_harabasz_score", metrics.calinski_harabasz_score),
                    ("davies_bouldin_score", metrics.davies_bouldin_score),
                ):
                    mlflow.log_metric(metric_name, score_fn(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`


KeyboardInterrupt: 

Le meilleur model est avec :
- kernel = linear
- gamma = auto
- nu = 0.9

# On passe au model autoencoder

In [None]:
# Autoencoder wrapped in the scikit-learn estimator interface so it can be used as a pipeline step.
# NOTE(review): Input, Dense, Model and regularizers are not imported anywhere in the visible
# notebook (they look like Keras names) — import them before running this cell.

class Autoencoder(BaseEstimator, TransformerMixin):
    """Dense autoencoder whose transform() returns the reconstruction error of each row.

    Parameters
    ----------
    encoding_dim : int
        Number of units of the first encoding layer; the two inner layers use half of it.
    epochs : int
        Number of training epochs.
    batch_size : int
        Batch size used for training.
    validation_split : float
        Fraction of the training data passed to Keras as validation split.
    verbose : int
        Verbosity level passed to the training call.
    """

    def __init__(self, encoding_dim=2, epochs=50, batch_size=32, validation_split=0.1, verbose=0):
        self.encoding_dim = encoding_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.validation_split = validation_split
        self.verbose = verbose

    def fit(self, X, y=None):
        """Build, compile and train the network to reconstruct X from itself.

        `y` is accepted for pipeline compatibility and is not used.
        Returns self, with the trained network stored in `self.model`.
        """
        input_dim = X.shape[1]

        # The inner layers use half of encoding_dim; clamp to 1 so that
        # encoding_dim=1 does not ask for a layer with zero units.
        bottleneck_dim = max(1, int(self.encoding_dim / 2))

        # Encoder: input -> encoding_dim (tanh, L1 activity penalty) -> bottleneck (relu)
        input_layer = Input(shape=(input_dim, ))
        # 10e-5 is 1e-4
        encoder = Dense(self.encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(10e-5))(input_layer)
        encoder = Dense(bottleneck_dim, activation="relu")(encoder)

        # Decoder: bottleneck (tanh) -> input_dim (relu)
        # NOTE(review): a relu output cannot produce negative values; if X is centred
        # by a scaler upstream the reconstruction of negative entries is limited — confirm.
        decoder = Dense(bottleneck_dim, activation='tanh')(encoder)
        decoder = Dense(input_dim, activation='relu')(decoder)
        autoencoder = Model(inputs=input_layer, outputs=decoder)

        # Mean squared reconstruction error as the loss.
        # NOTE(review): 'accuracy' is kept as in the original, but it is not a
        # meaningful metric for a reconstruction task.
        autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

        # The input is also the target: the network learns to reproduce X
        autoencoder.fit(X, X, epochs=self.epochs, batch_size=self.batch_size, validation_split=self.validation_split, verbose=self.verbose)

        # Keep the trained network for transform()
        self.model = autoencoder

        return self

    def transform(self, X, y=None):
        """Return, for every row of X, the Euclidean norm of (X - reconstruction)."""
        # Reconstruct the input with the trained network
        predictions = self.model.predict(X)

        # Row-wise distance between the input and its reconstruction
        distances = np.linalg.norm(X - predictions, axis=1)

        return distances

# Voir les résultats sur le tableau de bord MLflow en utilisant pyngrok

In [4]:
# Open an ngrok tunnel so the MLflow dashboard can be reached from outside this machine
ngrok.kill()

# The auth token is read from the environment so the secret never lives in the notebook.
# NOTE: the token that used to be hard-coded in this cell has been exposed and should be revoked.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN")
if not NGROK_AUTH_TOKEN:
    raise RuntimeError("Set the NGROK_AUTH_TOKEN environment variable before opening the tunnel")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Tunnel to local port 5000 and print its public URL
ngrok_tunnel = ngrok.connect(addr='5000', proto='http', bind_tls=True)
print('MLflow Tracking UI: ', ngrok_tunnel.public_url)

t=2023-04-21T11:41:37+0000 lvl=warn msg="ngrok config file found at both XDG and legacy locations, using XDG location" xdg_path=C:\\Users\\ADMIN\\AppData\\Local/ngrok/ngrok.yml legacy_path=C:\\Users\\ADMIN\\.ngrok2\\ngrok.yml


MLflow Tracking UI:  https://81c2-41-92-53-189.ngrok-free.app


In [5]:
# Launch the MLflow tracking UI through a shell command.
# NOTE(review): the recorded output (^C) suggests this call blocks the cell until it is interrupted.
!mlflow ui

^C
