In [2]:
# importations
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics

import mlflow
from pyngrok import ngrok

from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

In [41]:
# chargement des données
data = pd.read_csv('Outputs/data.csv')

In [42]:
data.drop(['numero_compte'], axis=1, inplace=True)

In [29]:
data.isnull().sum()

lib_etendu             0
bilan                  0
montant_signe          0
signe                  0
diff_ope_val           0
diff_ope_val_signe     0
transactions_counts    0
dtype: int64

# On va maintenant creer un pipeline pour le preprocessing des données et le model

In [30]:
# On va recuperer les colonnes numeriques et categoricielles des donnees

# On recupere les colonnes numeriques
numerical_features = data.select_dtypes(exclude=['object']).columns

# On recupere les colonnes categoricielles
categorical_features = data.select_dtypes(include=['object']).columns

In [31]:
print(numerical_features)
print(categorical_features)

Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')
Index(['lib_etendu', 'signe', 'diff_ope_val_signe'], dtype='object')


# On a eu des difficultes a transformer les variables categoricielles dans le pipeline. Donc on va le faire manuellement avec une fonction

In [32]:
def encode_categorical_features(data):
    for feature in categorical_features:
        data[feature] = LabelEncoder().fit_transform(data[feature])
    return data


In [33]:
# On va creer une foncion global qui nous permettra de creer un pipeline avec des caracteristiques differentes

# Créer une fonction de transformation pour les variables catégorielles
cat_transformer = FunctionTransformer(encode_categorical_features)

# Creer un preprocessor
def create_preprocessor(transform_for_num = None):
    numerical_pipeline = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        transform_for_num
    )

    # On ne fera plus de transformations sur les variables categoricielles

    preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, data.select_dtypes(exclude=['object']).columns),
        ('cat', cat_transformer, data.select_dtypes(include=['object']).columns)
        ]
    )
    return preprocessor

def create_pipeline(model = None, transform_for_num = None):

    preprocessor = create_preprocessor(transform_for_num)
    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )
    return pipeline

# On va maintenant creer des experiences avec MLflow afin de determiner le meilleur model

On va etudier differents models :
- Clustering-Based Local Outlier Factor
- Isolation Forest
- One-Class SVM
- K-Means
- Autoencoder (Pas possible)

Nous utiliserons un mode d'apprentissage non supervisee pour detecter les anomalies. Donc pour les metrics d'evaluation, nous allons utiliser :
- silhouette_score : mesure la distance entre chaque point de données et les points de données de son cluster voisin le plus proche par rapport à la distance moyenne de tous les points de données dans le cluster. Un score élevé indique une bonne séparation des clusters (varie entre 0 et 1).
- calinski_harabasz_score :  mesure la séparation entre les clusters. Plus la variance inter-cluster est grande par rapport à la variance intra-cluster, plus le score est élevé, ce qui indique que les clusters sont bien séparés.
- davies_bouldin_score : est calculé en mesurant la distance entre chaque paire de clusters et en comparant cette distance à la somme des rayons des deux clusters. Un score plus faible indique que les clusters sont plus compacts et séparés.

In [34]:
import warnings
warnings.filterwarnings('ignore')

# On va commencer par le model K-Means

In [35]:
mlflow.sklearn.autolog()

In [36]:
# On va definir l'experience du K-Means

mlflow.set_experiment("K-Means_experiment")

<Experiment: artifact_location='file:///D:/Documents/Stage%20PFE/Projet/Codes/BenchmarkCodes/mlruns/107447297481619876', creation_time=1682335746082, experiment_id='107447297481619876', last_update_time=1682335746082, lifecycle_stage='active', name='K-Means_experiment', tags={}>

In [37]:
with mlflow.start_run(run_name="default_K-Means_with_MinMaxScaler") as run:
    # On va creer un pipeline avec le model K-Means
    pipeline = create_pipeline(model = KMeans(n_clusters=2), transform_for_num = MinMaxScaler())

    # On va creer un preprocessor
    preprocessor = create_preprocessor(transform_for_num = MinMaxScaler())

    # On va entrainer le model
    pipeline.fit(data)

    # On va recuperer les predictions
    predictions = pipeline.predict(data)

    # On va transformer data
    data = preprocessor.fit_transform(data)

    # On va ajouter les metrics
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

mlflow.end_run()

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`


In [43]:
with mlflow.start_run(run_name="default_K-Means_with_RobustScaler") as run:
    # On va creer un pipeline avec le model K-Means
    pipeline = create_pipeline(model = KMeans(n_clusters=2), transform_for_num = RobustScaler())

    # On va creer un preprocessor
    preprocessor = create_preprocessor(transform_for_num = RobustScaler())

    # On va entrainer le model
    pipeline.fit(data)

    # On va recuperer les predictions
    predictions = pipeline.predict(data)

    # On va transformer data
    data2 = preprocessor.fit_transform(data)

    # On va ajouter les metrics
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

mlflow.end_run()

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`


On va garder le minmax scaler pour le K-Means

In [45]:
inits = ['k-means++', 'random']
max_iters = [200, 500, 1000]
algorithms = ['lloyd', 'elkan']
tolerances = [1e-4, 1e-3, 1e-2, 1e-1]

# On va creer un preprocessor
preprocessor = create_preprocessor(transform_for_num = MinMaxScaler())


for init in inits:
    for max_iter in max_iters:
        for algorithm in algorithms:
            for tolerance in tolerances:
                with mlflow.start_run(run_name=f"K-Means_with_{init}_init_{max_iter}_max_iter_{algorithm}_algorithm_{tolerance}_tolerance") as run:

                    # On va creer un pipeline avec le model K-Means
                    pipeline = create_pipeline(model = KMeans(n_clusters=2, init=init, max_iter=max_iter, algorithm=algorithm, tol=tolerance), transform_for_num = MinMaxScaler())

                    # On va entrainer le model
                    pipeline.fit(data)

                    # On va recuperer les predictions
                    predictions = pipeline.predict(data)

                    # On va transformer data
                    data2 = preprocessor.fit_transform(data)

                    # On va ajouter les metrics
                    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
                    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
                    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

                mlflow.end_run()

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`
                                 Pipeline(steps=[('simpleimputer',
                           

# Conlusion K-Means:
Le meilleur model est K-Means normal avec les parametres par default. Meme en les faisant varier ca n'augmente pas forcement les metriques

# On passe au Isolation Forest

In [46]:
# On va definir l'experience du Isolation Forest

mlflow.set_experiment("Iforest_experiment")

2023/04/24 13:44:33 INFO mlflow.tracking.fluent: Experiment with name 'Iforest_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///D:/Documents/Stage%20PFE/Projet/Codes/BenchmarkCodes/mlruns/893774456322062994', creation_time=1682340273503, experiment_id='893774456322062994', last_update_time=1682340273503, lifecycle_stage='active', name='Iforest_experiment', tags={}>

In [47]:
# On va creer un preprocessor
preprocessor = create_preprocessor(transform_for_num = MinMaxScaler())

with mlflow.start_run(run_name="default_Iforest_with_MinMaxScaler") as run:
    # On va creer un pipeline avec le model Isolation Forest
    pipeline = create_pipeline(model = IsolationForest(), transform_for_num = MinMaxScaler())

    # On va entrainer le model
    pipeline.fit(data)

    # On va recuperer les predictions
    predictions = pipeline.predict(data)

    data2 = preprocessor.fit_transform(data)

    # On va ajouter les metrics
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`


In [48]:
# On va creer un preprocessor
preprocessor = create_preprocessor(transform_for_num = RobustScaler())

with mlflow.start_run(run_name="default_Iforest_with_RobustScaler") as run:
    # On va creer un pipeline avec le model Isolation Forest
    pipeline = create_pipeline(model = IsolationForest(), transform_for_num = RobustScaler())

    # On va entrainer le model
    pipeline.fit(data)

    # On va recuperer les predictions
    predictions = pipeline.predict(data)

    data2 = preprocessor.fit_transform(data)

    # On va ajouter les metrics
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`


Pour le Isolation Forest, le meilleur scaler est le RobustScaler

In [49]:
n_estimators = [200, 300, 400, 500, 1000]
max_samples = ['auto', 0.1, 0.01, 0.05, 0.001]
contaminations = ['auto', 0.1, 0.01, 0.05]

# On va creer un preprocessor
preprocessor = create_preprocessor(transform_for_num = RobustScaler())

for n_estimator in n_estimators:
    for max_sample in max_samples:
        for contamination in contaminations:
            with mlflow.start_run(run_name=f"IsolationForest_with_{n_estimator}_n_estimator_{max_sample}_max_sample_{contamination}_contamination") as run:
                # On va creer un pipeline avec le model Isolation Forest
                pipeline = create_pipeline(model = IsolationForest(n_estimators=n_estimator, max_samples=max_sample, contamination=contamination), transform_for_num = RobustScaler())

                # On va entrainer le model
                pipeline.fit(data)

                # On va recuperer les predictions
                predictions = pipeline.predict(data)

                data2 = preprocessor.fit_transform(data)

                # On va ajouter les metrics
                mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
                mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
                mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

            mlflow.end_run()

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`
                                 Pipeline(steps=[('simpleimputer',
                           

# Conclusion IsolationForest
Le meilleur model de l'IForest est avec :
- n_estimators = 300
- max_samples = 0.001
- contamination = 0.01

# On passe au CBLOF

In [50]:
# On va definir l'experience du CBLOF

mlflow.set_experiment("CBLOF_experiment")

2023/04/24 14:39:39 INFO mlflow.tracking.fluent: Experiment with name 'CBLOF_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///D:/Documents/Stage%20PFE/Projet/Codes/BenchmarkCodes/mlruns/952954466865625538', creation_time=1682343579923, experiment_id='952954466865625538', last_update_time=1682343579923, lifecycle_stage='active', name='CBLOF_experiment', tags={}>

In [51]:
# On va creer un preprocessor
preprocessor = create_preprocessor(transform_for_num = MinMaxScaler())

with mlflow.start_run(run_name="default_LOF_with_MinMaxScaler") as run:
    # On va creer un pipeline avec le model CBLOF
    pipeline = create_pipeline(model = LocalOutlierFactor(), transform_for_num = MinMaxScaler())

    # On va entrainer le model
    predictions = pipeline.fit_predict(data)

    data2 = preprocessor.fit_transform(data)

    # On va ajouter les metrics
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`


In [52]:
# On va creer un preprocessor
preprocessor = create_preprocessor(transform_for_num = RobustScaler())

with mlflow.start_run(run_name="default_LOF_with_RobustScaler") as run:
    # On va creer un pipeline avec le model CBLOF
    pipeline = create_pipeline(model = LocalOutlierFactor(), transform_for_num = RobustScaler())

    # On va entrainer le model
    predictions = pipeline.fit_predict(data)

    data2 = preprocessor.fit_transform(data)

    # On va ajouter les metrics
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`


Pour le LOF, le meilleur scaler est le MinMaxScaler

In [14]:
n_neighbors = [20, 25, 30, 35, 40]
algorithms = ['kd_tree', 'brute', 'auto', 'ball_tree']
leaf_sizes = [20, 30, 40, 50]
contaminations = ['auto', 0.05, 0.1, 0.001]

for n_neighbor in n_neighbors:
    for algorithm in algorithms:
        for leaf_size in leaf_sizes:
            for contamination in contaminations:
                with mlflow.start_run(run_name=f"LOF_with_{n_neighbor}_n_neighbor_{algorithm}_algorithm_{leaf_size}_leaf_size_{contamination}_contamination") as run:
                    # On va creer un pipeline avec le model Isolation Forest
                    pipeline = create_pipeline(model = LocalOutlierFactor(n_neighbors=n_neighbor, algorithm=algorithm, leaf_size=leaf_size, contamination=contamination), transform_for_num = RobustScaler())
                    '''
                    # On va entrainer le model
                    pipeline.fit(data)
                    '''
                    # On va recuperer les predictions
                    predictions = pipeline.fit_predict(data)

                    # On va ajouter les metrics
                    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
                    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
                    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

                mlflow.end_run()

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='object'...`
                                 Pipeline(steps=[('simpleimputer',
                                             

# Conclusion LOF
Le meilleur model est avec :
- n_neighbors = 40
- algorithm = kd_tree
- leaf_size = 50
- contamination = 0.05

# On passe au One-Class SVM

In [54]:
# On va definir l'experience du One-Class SVM

mlflow.set_experiment("OneClassSVM_experiment")

2023/04/24 14:41:57 INFO mlflow.tracking.fluent: Experiment with name 'OneClassSVM_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///D:/Documents/Stage%20PFE/Projet/Codes/BenchmarkCodes/mlruns/283473541032408725', creation_time=1682343717233, experiment_id='283473541032408725', last_update_time=1682343717233, lifecycle_stage='active', name='OneClassSVM_experiment', tags={}>

In [55]:
# On va creer un preprocessor
preprocessor = create_preprocessor(transform_for_num = MinMaxScaler())

with mlflow.start_run(run_name="default_OneClassSVM_with_MinMaxScaler") as run:
    # On va creer un pipeline avec le model One-Class SVM
    pipeline = create_pipeline(model = OneClassSVM(), transform_for_num = MinMaxScaler())

    # On va entrainer le model
    pipeline.fit(data)

    predictions = pipeline.predict(data)

    data2 = preprocessor.fit_transform(data)

    # On va ajouter les metrics
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('minmaxscaler',
                                                  MinMaxScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`


In [56]:
# On va creer un preprocessor
preprocessor = create_preprocessor(transform_for_num = RobustScaler())

with mlflow.start_run(run_name="default_OneClassSVM_with_RobustScaler2") as run:
    # On va creer un pipeline avec le model One-Class SVM
    pipeline = create_pipeline(model = OneClassSVM(), transform_for_num = RobustScaler())

    # On va entrainer le model
    pipeline.fit(data)

    predictions = pipeline.predict(data)

    data2 = preprocessor.fit_transform(data)

    # On va ajouter les metrics
    mlflow.log_metric("silhouette_score", metrics.silhouette_score(data2, predictions))
    mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data2, predictions))
    mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data2, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                     ...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['bilan', 'montant_signe', 'diff_ope_val', 'transactions_counts'], dtype='object')),
                                ('cat',...`


On va continuer avec MinMaxScaler

In [15]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
gammas = ['scale', 'auto']
nus = [0.1, 0.3, 0.5, 0.7, 0.9]

for kernel in kernels:
    for gamma in gammas:
        for nu in nus:
            with mlflow.start_run(run_name=f"kernel_{kernel}_gamma_{gamma}_nu_{nu}_with_StandardScaler") as run:
                # On va creer un pipeline avec le model One-Class SVM
                pipeline = create_pipeline(model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu), transform_for_num = RobustScaler())

                # On va entrainer le model
                pipeline.fit(data)

                predictions = pipeline.predict(data)

                # On va ajouter les metrics
                mlflow.log_metric("silhouette_score", metrics.silhouette_score(data, predictions))
                mlflow.log_metric("calinski_harabasz_score", metrics.calinski_harabasz_score(data, predictions))
                mlflow.log_metric("davies_bouldin_score", metrics.davies_bouldin_score(data, predictions))

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['lib_etendu', 'bilan', 'montant_signe', 'signe', 'num_oper',
       'diff_ope_val'],
      dtype='obj...`


KeyboardInterrupt: 

Le meilleur model est avec :
- kernel = linear
- gamma = auto
- nu = 0.9

# Voir les resultats sur le tableau de bord MLflow en utilisant nyngrok

In [3]:
# On va creer un tunnel pour acceder au tableau de bord MLflow
ngrok.kill()

NGROK_AUTH_TOKEN = "2OgxmHRxos2U37s8DO5FlG7Pn3D_3ih56ethLcFPMdEhTBxnS"
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

ngrok_tunnel = ngrok.connect(addr='5000', proto='http', bind_tls=True)
print('MLflow Tracking UI: ', ngrok_tunnel.public_url)

t=2023-04-24T17:52:42+0100 lvl=warn msg="ngrok config file found at both XDG and legacy locations, using XDG location" xdg_path=C:\\Users\\ADMIN\\AppData\\Local/ngrok/ngrok.yml legacy_path=C:\\Users\\ADMIN\\.ngrok2\\ngrok.yml


MLflow Tracking UI:  https://88a8-105-66-5-38.ngrok-free.app


In [4]:
!mlflow ui

^C


# On va essayer une autre approche.
L'idee est d'essayer de labeleliser nos donnees, c'est a dire detecter les outliers et les inliers en faisant des predictions basiques avec nos 4 modeles. L'idee est qu'ensuite on puisse avoir une idee assez claire des anomalies pour pouvoir par la suite essayer de confirmer ou non les performances d'un modele