<div align=center>
<h1>Comment démasquer les fraudeurs ?</h1>
<h2>par BNP Paribas PF</h2>
<h3>MICHELON François</h3>
<h3>ZABLIT Charles</h3>
</div>


# 0. Configuration


## 0.1 Imports de modules


In [None]:
import logging
import pathlib
import re
from typing import Tuple, List
from collections import Counter

import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    GridSearchCV,
    train_test_split,
    StratifiedShuffleSplit,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import scale, StandardScaler
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer

## 0.2 Configuration du logger


In [None]:
# Initialize logger
logging.basicConfig(level=logging.INFO)

## 0.3 Configuration des chemins de fichiers


In [None]:
# Challenge CSV files, given as paths relative to the working directory.
X_train_file = pathlib.Path("data/X_train_G3tdtEn.csv")
y_train_file = pathlib.Path("data/Y_train_2_XPXJDyy.csv")
X_test_file = pathlib.Path("data/X_test_8skS2ey.csv")

# 1. Analyse des données


## 1.1 Chargement des données


In [None]:
def load_train_df(path: pathlib.Path) -> pd.DataFrame:
    """Read a feature CSV and return it without its ``ID`` column.

    The ``item``, ``make``, ``model`` and ``goods_code`` columns (suffixes
    1 to 24) are read as ``str`` instead of letting pandas infer their type.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The file content as a DataFrame, ``ID`` column dropped.
    """
    text_prefixes = ("item", "make", "model", "goods_code")
    forced_dtypes = {
        f"{prefix}{slot}": str for prefix in text_prefixes for slot in range(1, 25)
    }
    frame = pd.read_csv(path, dtype=forced_dtypes)
    return frame.drop(columns=["ID"])


def load_test_df(path: pathlib.Path) -> pd.Series:
    """Return the ``fraud_flag`` column of the CSV stored at ``path``.

    Args:
        path: Location of the CSV file; it must contain a ``fraud_flag`` column.

    Returns:
        The ``fraud_flag`` column as a Series.
    """
    labels_frame = pd.read_csv(path)
    return labels_frame["fraud_flag"]

In [None]:
# Feature files go through load_train_df (text columns read as str, ID dropped).
# load_test_df, despite its name, is what returns the fraud_flag labels.
X_train_df = load_train_df(X_train_file)
y_train_df = load_test_df(y_train_file)
X_test_df = load_train_df(X_test_file)

In [None]:
X_train_df.head()

In [None]:
X_train_df.describe()

## 1.2 Étude de la parcimonie des données


In [None]:
def sparse_coeff(X: pd.DataFrame) -> float:
    """Return the fraction of cells of ``X`` that are null.

    Args:
        X: Frame to inspect.

    Returns:
        Number of null cells divided by the total number of cells, in
        ``[0, 1]``. An empty frame yields ``0.0``.
    """
    nb_total = X.shape[0] * X.shape[1]
    if nb_total == 0:
        # No cell at all: avoid a 0/0 division.
        return 0.0
    nb_sparse = X.isnull().sum().sum()
    return float(nb_sparse / nb_total)


print(f"{sparse_coeff(X_train_df):.2%} des données sont nulles.")

In [None]:
# Pie chart of the non-null counts of the item1..item24 columns.
X_train_df.notnull().sum()[[f"item{i}" for i in range(1, 25)]].plot.pie(
    autopct="%1.0f%%",
)
plt.xticks(rotation=45)
plt.title("Répartition des items non nuls");

92% des commandes ont moins de 3 items.


## 1.3 Étude des tendances


In [None]:
# Non-null counts of item5..item24 for the orders labelled 0 (non-fraud).
X_train_df[y_train_df == 0].notnull().sum()[
    [f"item{i}" for i in range(5, 25)]
].plot.bar()
plt.ylabel("Nombre de valeurs non nulles")
plt.xlabel("Colonne")
plt.xticks(rotation=45);

In [None]:
# Non-null counts of item5..item24 for the orders labelled 1 (fraud).
X_train_df[y_train_df == 1].notnull().sum()[
    [f"item{i}" for i in range(5, 25)]
].plot.bar()
plt.ylabel("Nombre de valeurs non nulles")
plt.xlabel("Colonne")
plt.xticks(rotation=45);

En comparant, pour les colonnes item5 à item24, les commandes non frauduleuses puis les commandes frauduleuses, on n'observe pas de tendance sur le nombre d'items par panier.

---


In [None]:
# Histogram of Nb_of_items for non-fraud orders, restricted to Nb_of_items < 25.
X_train_df[(y_train_df == 0) & (X_train_df["Nb_of_items"] < 25)][
    "Nb_of_items"
].plot.hist(bins=25)
plt.ylabel("Nombre de commandes")
plt.xlabel("Number_of_items")
plt.xticks(rotation=45);

In [None]:
# Histogram of Nb_of_items for fraud orders (no upper bound on Nb_of_items is applied here).
X_train_df[y_train_df == 1]["Nb_of_items"].plot.hist(bins=25)
plt.ylabel("Nombre de commandes")
plt.xlabel("Number_of_items")
plt.xticks(rotation=45);

Là encore, on ne distingue pas de tendance significative sur la distribution du nombre d'items par panier.

---


In [None]:
def total_price(X: pd.DataFrame) -> pd.Series:
    """Sum the ``cash_price1`` .. ``cash_price24`` columns row by row.

    Args:
        X: Frame containing the 24 ``cash_price`` columns.

    Returns:
        One total per row of ``X``.
    """
    price_columns = ["cash_price" + str(slot) for slot in range(1, 25)]
    prices = X.loc[:, price_columns]
    return prices.sum(axis="columns")


def plot_total_price_hist(X: pd.DataFrame, y: pd.DataFrame, fraud: bool) -> None:
    """Plot the histogram of order totals for one class of orders.

    Only orders whose total price is below 10000 are kept. Two vertical
    lines mark the mean (red) and the standard deviation (green) of the
    plotted totals.

    Args:
        X: Orders, with the ``cash_price`` columns read by ``total_price``.
        y: Labels aligned with ``X``.
        fraud: If true, plot the orders labelled 1; otherwise those labelled 0.
    """
    wanted_label = 1 if fraud else 0
    selection = (total_price(X) < 10000) & (y == wanted_label)
    totals = total_price(X[selection])

    totals.plot.hist(bins=25)
    plt.xlim(0, 10000)

    # One vertical marker per statistic, each annotated with its rounded value.
    markers = (
        (totals.mean(), "red", "Moyenne"),
        (totals.std(), "green", "Ecart type"),
    )
    for value, colour, caption in markers:
        plt.axvline(x=value, color=colour)
        plt.text(value, 0, f"{caption} = {value:.0f}", rotation=90)

    plt.ylabel("Nombre de commandes")
    plt.xlabel("Prix total de la commande")


plot_total_price_hist(X_train_df, y_train_df, fraud=False)

In [None]:
plot_total_price_hist(X_train_df, y_train_df, fraud=True)

On observe une légère augmentation de la moyenne et de l'écart type du prix total des commandes pour les fraudes. Nous allons essayer d'utiliser ces informations dans nos features.


## 1.4 Étude de la distribution des classes


In [None]:
# Class balance of the labels. value_counts() sorts by decreasing frequency,
# so the legend order below assumes non-fraud is the majority class.
y_train_df.value_counts().plot.pie(
    autopct="%1.0f%%",
)
plt.legend(["Non frauduleux", "Frauduleux"])
plt.ylabel("");

C'est un jeu de données de détection de fraudes, donc la distribution frauduleux/non frauduleux est très inégale.


# 2. Feature engineering


## 2.1 Création de nouvelles features


In [None]:
class FeaturesExtractor(BaseEstimator, TransformerMixin):
    """Add basket-level features and frequency-encode the goods codes.

    ``fit`` counts how often each goods code appears in the training frame.
    ``transform`` returns a new frame with aggregate columns added, every
    ``goods_code{i}`` replaced by that learned count, and missing values
    filled. The input frame is never modified, so the same frame can be fed
    to several pipelines.
    """

    # Orders expose 24 slots: item1..item24, make1..make24, and so on.
    _NB_SLOTS = 24

    # Occurrences of each goods code seen during fit.
    _counter: Counter

    @classmethod
    def _slot_columns(cls, prefix: str) -> List[str]:
        """Return ``[prefix1, ..., prefix24]``."""
        return [f"{prefix}{i}" for i in range(1, cls._NB_SLOTS + 1)]

    def fit(self, X, y=None):
        """Count the occurrences of every goods code present in ``X``.

        Args:
            X: Frame holding the ``goods_code1`` .. ``goods_code24`` columns.
            y: Ignored.

        Returns:
            The fitted transformer.
        """
        goods = X[self._slot_columns("goods_code")].to_numpy().ravel()
        # Only strings are real codes; missing slots are encoded as 0 by
        # transform, so there is no need to count them.
        self._counter = Counter(code for code in goods if isinstance(code, str))
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` enriched with the engineered features.

        Args:
            X: Frame with the ``cash_price``, ``Nbr_of_prod_purchas``,
                ``item``, ``make``, ``model`` and ``goods_code`` slot columns
                and a ``Nb_of_items`` column.
            y: Ignored.

        Returns:
            A new DataFrame; ``X`` itself is left untouched.

        Raises:
            KeyError: If a row has no value in any ``Nbr_of_prod_purchas``
                column, so no most-purchased slot can be determined.
        """
        # Work on a copy: mutating the caller's frame would turn its goods
        # codes into integers and break any later fit on the same frame.
        df = X.copy()

        price_cols = self._slot_columns("cash_price")
        count_cols = self._slot_columns("Nbr_of_prod_purchas")
        item_cols = self._slot_columns("item")
        make_cols = self._slot_columns("make")
        model_cols = self._slot_columns("model")
        goods_cols = self._slot_columns("goods_code")

        prices = df[price_cols]
        # Total, average, extreme and variance of the item prices of the order.
        df["total_price"] = prices.sum(axis=1)
        df["avg_item_price"] = df["total_price"] / df["Nb_of_items"]
        df["max_item_price"] = prices.max(axis=1)
        df["min_item_price"] = prices.min(axis=1)
        df["price_variance"] = prices.var(axis=1)
        # Number of products purchased over all slots.
        df["product_count"] = df[count_cols].sum(axis=1)
        # Number of distinct manufacturers / models in the order.
        df["nb_unique_makes"] = df[make_cols].nunique(axis=1)
        df["nb_unique_models"] = df[model_cols].nunique(axis=1)
        # Ratio of items to purchased products.
        df["item_to_product_ratio"] = df["Nb_of_items"] / df["product_count"]

        # Item sitting in the slot with the highest purchased quantity
        # (first such slot on ties). A missing item becomes the string "nan".
        top_count_cols = df[count_cols].idxmax(axis=1)
        slot_of = {name: pos for pos, name in enumerate(count_cols)}
        item_values = df[item_cols].to_numpy(dtype=object)
        df["most_common_item"] = [
            str(row_items[slot_of[col]])
            for row_items, col in zip(item_values, top_count_cols)
        ]

        # Replace each goods code by the number of times it was seen during
        # fit; unseen codes and missing slots get 0.
        counts = self._counter
        for col in goods_cols:
            df[col] = [
                counts[code] if isinstance(code, str) else 0 for code in df[col]
            ]
        df[goods_cols] = df[goods_cols].astype(int)

        # Text columns are kept as strings; every other column is treated
        # as numerical.
        categorical_columns = make_cols + item_cols + model_cols
        categorical_columns.append("most_common_item")
        categorical_set = set(categorical_columns)
        numerical_columns = [col for col in df.columns if col not in categorical_set]

        # Fill missing values: empty string for text, 0 for numbers.
        df[categorical_columns] = df[categorical_columns].fillna("")
        df[numerical_columns] = df[numerical_columns].fillna(0)

        return df

## 2.2 Transformer Word2Vec


In [None]:
class Word2Vec(BaseEstimator, TransformerMixin):
    """Turn each text cell into one number derived from pretrained word vectors.

    Every cell is split on whitespace; the vectors of the tokens known to the
    pretrained model are averaged over all tokens and all dimensions, giving
    a single scalar per cell. Cells with no known token map to 0.
    """

    def __init__(self, num_words=None, **kwargs):
        self.num_words = num_words
        # NOTE(review): this tokenizer is built here but is not used by
        # fit/transform below.
        self.tokenizer = Tokenizer(num_words=num_words, **kwargs)

    def fit(self, X, y=None):
        """Load the pretrained ``word2vec-google-news-300`` vectors.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            The fitted transformer.
        """
        self.Word2 = api.load("word2vec-google-news-300")
        return self

    def transform(self, X, y=None):
        """Encode every cell of ``X`` as a float.

        Args:
            X: Table of strings (a DataFrame, or anything array-like).
            y: Ignored.

        Returns:
            A float64 array with the same shape as ``X``. Cells that are not
            strings, or that contain no known token, are encoded as 0.
        """
        cells = np.asarray(getattr(X, "values", X), dtype=object)
        # A numeric output (rather than an object array written in place)
        # lets downstream numeric code consume the result directly.
        encoded = np.zeros(cells.shape, dtype=float)
        vectors = self.Word2
        vocabulary = vectors.key_to_index
        for position, text in np.ndenumerate(cells):
            if not isinstance(text, str):
                # Missing values (e.g. NaN) carry no token.
                continue
            embeddings = [vectors[token] for token in text.split() if token in vocabulary]
            if embeddings:
                # Mean over every token and every dimension: one scalar per cell.
                encoded[position] = np.mean(embeddings)
        return encoded

    def get_params(self, deep=True):
        """Return only ``num_words``; the extra tokenizer kwargs are not exposed."""
        return {"num_words": self.num_words}

## 2.3 Définition de la pipeline


In [None]:
def generate_pipeline() -> Pipeline:
    """Build a fresh, unfitted preprocessing pipeline.

    The pipeline first runs ``FeaturesExtractor``, then a ``ColumnTransformer``
    that sends object-dtype columns to ``Word2Vec`` and numeric columns to
    ``StandardScaler``.

    Returns:
        A new ``Pipeline`` instance.
    """
    text_columns = make_column_selector(dtype_include=object)
    numeric_columns = make_column_selector(dtype_include=np.number)
    column_steps = [
        ("text_pipeline", Word2Vec(), text_columns),
        ("num_pipeline", StandardScaler(), numeric_columns),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps)

    pipeline_steps = [
        ("features_extract", FeaturesExtractor()),
        ("preprocess_static", preprocessor),
    ]
    return Pipeline(steps=pipeline_steps)

## 2.4 Étude de la corrélation des données


In [None]:
def label_corr(X: np.ndarray, y: pd.Series) -> pd.Series:
    """Rank features by absolute linear correlation with the label.

    Args:
        X: Feature matrix with 155 columns (six groups of 24 slot columns
            plus 11 aggregate columns), one row per element of ``y``.
        y: Labels, in the same row order as ``X``. Its index is ignored.

    Returns:
        Absolute correlation of every column (label included) with the
        label, sorted in decreasing order.
    """
    slot_prefixes = (
        "item",
        "cash_price",
        "make",
        "model",
        "goods_code",
        "Nbr_of_prod_purchas",
    )
    # NOTE(review): names are assigned by position. Confirm they match the
    # column order actually produced upstream (a ColumnTransformer emits its
    # transformers' outputs in the order the transformers are listed).
    columns = [f"{prefix}{i}" for prefix in slot_prefixes for i in range(1, 25)] + [
        "Nb_of_items",
        "total_price",
        "avg_item_price",
        "max_item_price",
        "min_item_price",
        "price_variance",
        "product_count",
        "nb_unique_makes",
        "nb_unique_models",
        "item_to_product_ratio",
        "most_common_item",
    ]
    # Force a float matrix so that object-dtype input is not dropped or
    # rejected by ``corr``.
    df = pd.DataFrame(np.asarray(X, dtype=float), columns=columns)
    label = y.name if y.name is not None else "fraud_flag"
    # Attach the label by position: ``y`` may carry a non-default index
    # (e.g. after a train/test split), which index alignment would turn
    # into NaN.
    df[label] = np.asarray(y)
    return df.corr()[label].abs().sort_values(ascending=False)


# Show the 15 columns most correlated (in absolute value) with the label,
# computed on the output of a freshly fitted preprocessing pipeline.
label_corr(
    generate_pipeline().fit_transform(X_train_df, y_train_df),
    y_train_df,
).head(15)

## 2.5 Représentation TSNE


In [None]:
def plot_tsne(X: pd.DataFrame, y: pd.Series) -> None:
    """Scatter-plot a 2-D t-SNE embedding of the preprocessed features.

    Args:
        X: Raw orders, as accepted by the preprocessing pipeline.
        y: Labels in the same row order as ``X``; used to colour the points.
    """
    features = generate_pipeline().fit_transform(X, y)
    data_norm = scale(features)
    # t-SNE is stochastic: pin the seed, like every other random step of
    # this notebook, so the figure is reproducible.
    data_tsne = TSNE(random_state=0).fit_transform(data_norm)
    data_tsne_df = pd.DataFrame(
        {
            "Dim1": data_tsne[:, 0],
            "Dim2": data_tsne[:, 1],
            "target": np.asarray(y),
        }
    )
    # ``data`` is passed by keyword: older seaborn releases do not accept it
    # positionally.
    sns.scatterplot(data=data_tsne_df, x="Dim1", y="Dim2", hue="target")


plot_tsne(X_train_df, y_train_df)

La représentation TSNE et les faibles valeurs de corrélation linéaire données précédemment nous poussent à choisir un classifieur par arbre de décision comme le `RandomForest`, plutôt que des classifieurs linéaires.

# 3. Entraînement du modèle


## 3.1 Tuning des hyperparamètres


In [None]:
# Grid search over the random-forest hyper-parameters, scored by average
# precision on 5 stratified shuffle splits (20% held out in each split).
grid = GridSearchCV(
    Pipeline(
        steps=[
            ("preprocessor", generate_pipeline()),
            ("clf", RandomForestClassifier(random_state=0)),
        ]
    ),
    # Wider search space, kept for reference:
    # param_grid={
    #     "clf__bootstrap": [True],
    #     "clf__n_estimators": [395, 400, 405],
    #     "clf__max_depth": [19, 20, 21],
    #     "clf__min_samples_split": [2, 3, 5, 7],
    #     "clf__min_samples_leaf": [1, 2, 3],
    #     "clf__class_weight": [None, "balanced"],
    # },

    # To keep the run time down, the optimal parameters are not recomputed:
    # the grid below holds a single combination.
    param_grid={
        "clf__bootstrap": [True],
        "clf__n_estimators": [400],
        "clf__max_depth": [20],
        "clf__min_samples_split": [5],
        "clf__min_samples_leaf": [1],
        "clf__class_weight": [None],
    },
    scoring="average_precision",
    cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0),
    n_jobs=-1,
    verbose=True,
)

## 3.2 Split des données


In [None]:
# Stratified 80/20 hold-out split; the two prints show the fraud rate on each side.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_df, y_train_df, test_size=0.2, stratify=y_train_df, random_state=0
)
print(f"{y_train.value_counts()[1] / len(y_train):.5%} de fraudes dans le train set.")
print(f"{y_test.value_counts()[1] / len(y_test):.5%} de fraudes dans le test set.")

## 3.3 Entraînement du modèle


In [None]:
grid.fit(X_train, y_train)

In [None]:
# Strip the "clf__" pipeline prefix so the values can be passed directly to
# RandomForestClassifier as keyword arguments.
best_params = {k.replace("clf__", ""):v for k,v in grid.best_params_.items()}
print(f"Meilleurs paramètres du GridSearch: {best_params}")
print(f"Meilleur score du GridSearch: {grid.best_score_:0.5%}")

## 3.4 Pipeline finale


In [None]:
# Final model: a fresh preprocessing pipeline followed by a random forest
# configured with the parameters selected by the grid search.
rf_pipeline = make_pipeline(
    generate_pipeline(),
    RandomForestClassifier(random_state=0, n_jobs=-1, **best_params),
)

## 3.5 Évaluation


In [None]:
# Redo the hold-out split with the same arguments as in section 3.2, then
# fit the final pipeline on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_df, y_train_df, test_size=0.2, stratify=y_train_df, random_state=0
)
rf_pipeline.fit(X_train, y_train)

In [None]:
# Precision-recall curve of the fitted pipeline on the held-out part, with
# the chance level drawn for reference.
metrics.PrecisionRecallDisplay.from_estimator(
    rf_pipeline, X_test, y_test, name="RandomForest", plot_chance_level=True
);

## 3.6 Prédiction sur les données de test


In [None]:
def make_test_predictions(
    X_train_df: pd.DataFrame,
    y_train_df: pd.Series,
    X_test_df: pd.DataFrame,
    rf_pipeline: Pipeline,
) -> pd.DataFrame:
    rf_pipeline.fit(X_train_df, y_train_df)
    out = rf_pipeline.predict_proba(X_test_df)
    out_ = out[:, 1]
    IDs = pd.read_csv(X_test_file)["ID"]
    df = pd.DataFrame({"ID": IDs, "fraud_flag": out_})
    df = df.reset_index()
    df.to_csv("out.csv", index=False)