# Notebook 2 - Scenarios Rules based

Notre objectif ici sera de construire puis d'optimiser des scénarios à "règles explicites"

### On importe les librairies necessaires

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.problems.functional import FunctionalProblem
from pymoo.optimize import minimize
from pymoo.operators.crossover.pntx import TwoPointCrossover
from pymoo.operators.mutation.bitflip import BitflipMutation
# For random sampling in NSGA-II
from pymoo.operators.sampling.rnd import FloatRandomSampling
# For
from pymoo.operators.crossover.sbx import SBX
# For
from pymoo.operators.mutation.pm import PM
from pymoo.util.nds.non_dominated_sorting import NonDominatedSorting

import random
import itertools
from sklearn.model_selection import train_test_split

### On importe les données nécessaires

Les seules données dont on aura vraiment besoin ici est l'aggrégat que l'on a calculé dans le notebook 1

In [2]:
df = pd.read_csv("12M_trans/AML_features.csv")

features = [col for col in df.columns if col not in ['IS_FRAUD', 'ACCOUNT_ID']]

### Paramètres utilisateur

In [3]:
min_recall = 0.85
training_size = 0.8
rule_nb = 5

### Création de fonctions que l'on utilisera par la suite

In [4]:
def recall(thresholds, rule_str, dataset):
    """
    Calcule le rappel pour une règle donnée et un ensemble de thresholds.
    
    Parameters:
    thresholds (list): Une liste de valeurs de seuils [threshold_1, threshold_2, ..., threshold_n].
    rule_str (str): La règle à appliquer sous forme de chaîne de caractères. 
                    Les variables de seuil doivent être sous la forme R1, R2, etc.
    dataset (pd.DataFrame): Le dataset sur lequel appliquer la règle.
    
    Returns:
    float: Le rappel (recall) calculé.
    """
    
    if len(thresholds) != len(features):
        raise ValueError("Le nombre de thresholds doit correspondre au nombre de colonnes pertinentes.")

    conditions = {
        f'R{i+1}': f"(dataset['{col}'] >= {thresholds[i]})"
        for i, col in enumerate(features)
    }
    
    # Remplacer les placeholders R1, R2, etc., par les conditions réelles
    #for key, condition in conditions.items():
    #    rule_str = rule_str.replace(key, condition)
        
    # Replace placeholders with actual conditions, starting with the largest index
    for key in sorted(conditions.keys(), key=lambda k: len(k), reverse=True):
        rule_str = rule_str.replace(key, conditions[key])
    
    
    # Appliquer la règle au dataset en utilisant eval
    alerts = dataset[eval(rule_str)]
    
    # Calculer le nombre de fraudes détectées
    num_fraud_alerts = alerts[alerts['IS_FRAUD'] == 1].shape[0]
    total_sar = dataset[dataset['IS_FRAUD'] == 1].shape[0]
    # Calculer le rappel
    recall_value = num_fraud_alerts / total_sar
    return recall_value



def conversion_rate(thresholds, rule_str, dataset):
    """
    Calcule le taux de conversion pour une règle donnée et un ensemble de thresholds.
    
    Parameters:
    thresholds (list): Une liste de valeurs de seuils [threshold_1, threshold_2, ..., threshold_n].
    rule_str (str): La règle à appliquer sous forme de chaîne de caractères. 
                    Les variables de seuil doivent être sous la forme R1, R2, etc.
    dataset (pd.DataFrame): Le dataset sur lequel appliquer la règle.
    
    Returns:
    float: Le taux de conversion calculé.
    """
    
    if len(thresholds) != len(features):
        raise ValueError("Le nombre de thresholds doit correspondre au nombre de colonnes pertinentes.")

    conditions = {
        f'R{i+1}': f"(dataset['{col}'] >= {thresholds[i]})"
        for i, col in enumerate(features)
    }
    
    # Remplacer les placeholders R1, R2, etc., par les conditions réelles
    #for key, condition in conditions.items():
    #    rule_str = rule_str.replace(key, condition)
    
    # Replace placeholders with actual conditions, starting with the largest index
    for key in sorted(conditions.keys(), key=lambda k: len(k), reverse=True):
        rule_str = rule_str.replace(key, conditions[key])
    
    
    # Appliquer la règle au dataset en utilisant eval
    alerts = dataset[eval(rule_str)]
    
    # Calculer le nombre total d'alertes
    num_alerts = len(alerts)
    
    # Calculer le nombre de fraudes détectées
    num_fraud_alerts = alerts[alerts['IS_FRAUD'] == 1].shape[0]

    # Calculer le taux de conversion
    conversion_rate_value = num_fraud_alerts / num_alerts if num_alerts > 0 else 0
    
    return conversion_rate_value


def alert_volume(thresholds, rule_str, dataset):
    if len(thresholds) != len(features):
        raise ValueError("Le nombre de thresholds doit correspondre au nombre de colonnes pertinentes.")

    conditions = {
        f'R{i+1}': f"(dataset['{col}'] >= {thresholds[i]})"
        for i, col in enumerate(features)
    }
    
    # Remplacer les placeholders R1, R2, etc., par les conditions réelles
    #for key, condition in conditions.items():
    #    rule_str = rule_str.replace(key, condition)
    # Replace placeholders with actual conditions, starting with the largest index
    for key in sorted(conditions.keys(), key=lambda k: len(k), reverse=True):
        rule_str = rule_str.replace(key, conditions[key])
    
    
    
    # Appliquer la règle au dataset en utilisant eval
    alerts = dataset[eval(rule_str)]
    
    # Calculer le nombre total d'alertes
    num_alerts = len(alerts)
    
    return num_alerts


def generate_rules(df, n):
    num_features = len(features)
    rules = set()  # Using a set to ensure uniqueness
    
    # Start with the most restrictive rule (all AND)
    base_rule = ' & '.join([f'R{i+1}' for i in range(num_features)])
    rules.add(base_rule)

    # Generate less restrictive rules by introducing OR operators
    while len(rules) < n:
        rule = list(f'R{i+1}' for i in range(num_features))
        
        # Decide how many ANDs we will keep (hardest rule has all ANDs)
        num_ands = random.randint(1, max(1, num_features // 2))  # Less than half ANDs
        
        # Randomly replace some ANDs with ORs
        for _ in range(num_features - num_ands):
            idx = random.choice(range(len(rule) - 1))
            rule[idx] = f'({rule[idx]} | {rule.pop(idx + 1)})'
        
        # Join the final rule
        rule_str = ' & '.join(rule)
        rules.add(rule_str)

    return list(rules)


def compute_pareto_front(points):
    """Computes the Pareto front from a set of points."""
    pareto_front = []
    for i, point in enumerate(points):
        dominated = False
        for j, other in enumerate(points):
            if i != j:
                if all(other <= point) and any(other < point):
                    dominated = True
                    break
        if not dominated:
            pareto_front.append(point)
    return np.array(pareto_front)




def evaluate_thresholds(pareto_solutions, rule_str, train_set, test_set):
    """Evaluates recall and conversion rate for each threshold set on both train and test sets."""
    results = []
    
    for sol in pareto_solutions:
        # Calculate metrics on the training set
        train_recall = recall(sol, rule_str, train_set)
        train_conversion_rate = conversion_rate(sol, rule_str, train_set)
        
        # Calculate metrics on the test set
        test_recall = recall(sol, rule_str, test_set)
        test_conversion_rate = conversion_rate(sol, rule_str, test_set)
        
        # Store the results
        results.append({
            'solution': sol,
            'train_recall': train_recall,
            'train_conversion_rate': train_conversion_rate,
            'test_recall': test_recall,
            'test_conversion_rate': test_conversion_rate
        })
    
    # Sort by recall on the test set in descending order
    results_sorted = sorted(results, key=lambda x: x['test_recall'], reverse=True)
    
    return results_sorted



# 1 - Rule generator

### Génération des règles d'alerte

In [5]:
rules = generate_rules(df, rule_nb)

for i, rule in enumerate(rules, 1):
    print(f"Rule {i}: {rule}")

Rule 1: R1 & (R2 | (R3 | R4)) & R5 & (R6 | (R7 | ((R8 | R9) | R10))) & (R11 | (R12 | R13)) & ((R14 | (R15 | R16)) | (((R17 | R18) | ((R19 | R20) | (R21 | ((R22 | R23) | (R24 | R25))))) | R26))
Rule 2: R1 & R2 & (R3 | R4) & (R5 | R6) & (R7 | (R8 | R9)) & (((R10 | R11) | R12) | R13) & R14 & R15 & R16 & (R17 | R18) & (R19 | R20) & (R21 | (R22 | R23)) & ((R24 | R25) | R26)
Rule 3: (R1 | (R2 | (R3 | (R4 | R5)))) & R6 & R7 & (R8 | (R9 | R10)) & (R11 | R12) & (R13 | R14) & ((R15 | R16) | R17) & R18 & (R19 | R20) & (R21 | (R22 | R23)) & R24 & (R25 | R26)
Rule 4: R1 & R2 & R3 & R4 & R5 & R6 & R7 & R8 & R9 & R10 & R11 & R12 & R13 & R14 & R15 & R16 & R17 & R18 & R19 & R20 & R21 & R22 & R23 & R24 & R25 & R26
Rule 5: R1 & (((R2 | (R3 | R4)) | R5) | R6) & R7 & R8 & R9 & R10 & (R11 | R12) & ((R13 | R14) | R15) & (((R16 | R17) | R18) | R19) & ((R20 | R21) | R22) & R23 & ((R24 | R25) | R26)


### Génération des jeux de données

On utilise l'argument stratify pour être sûr que la proportion de SARs soit la même dans chaque set

In [6]:
# Assume X is your feature DataFrame and y is your target variable
X = df[features]
y = df["IS_FRAUD"]

# Step 1: Stratified split to create training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=(1 - training_size), stratify=y, random_state=42
)

# Step 2: Join the features and target for each set
train_set = pd.concat([X_train, y_train], axis=1)
test_set = pd.concat([X_test, y_test], axis=1)

# 2 - Bruteforce Tuning via méthode des pourcentiles

In [None]:
def calculate_percentile_thresholds(dataset, features, percentiles):
    """Calculate the threshold values for each feature at given percentiles."""
    thresholds = {}
    for feature in features:
        thresholds[feature] = np.percentile(dataset[feature], percentiles[feature])
    return thresholds


# Initialize percentiles and thresholds
percentiles = {feature: 1 for feature in features}
thresholds_dict = calculate_percentile_thresholds(train_set, features, percentiles)

# Convert thresholds from dictionary to list
thresholds = [thresholds_dict[feature] for feature in features]

for rule_str in rules[:1]:
    while True:
        can_update = False
        for feature in features:
            # Convert thresholds from dictionary to list before evaluating recall
            thresholds = [thresholds_dict[feat] for feat in features]

            current_recall = recall(thresholds, rule_str, train_set)
            if current_recall < min_recall:
                break  # Stop if recall is already below the limit

            # Try incrementing the percentile for the current feature
            if percentiles[feature] < 100:  # Ensure we're within bounds
                new_percentiles = percentiles.copy()
                new_percentiles[feature] += 1
                new_thresholds_dict = calculate_percentile_thresholds(train_set, features, new_percentiles)

                # Convert new_thresholds from dictionary to list
                new_thresholds = [new_thresholds_dict[feat] for feat in features]

                new_recall = recall(new_thresholds, rule_str, train_set)

                if new_recall >= min_recall:
                    percentiles = new_percentiles
                    thresholds_dict = new_thresholds_dict
                    can_update = True  # We made an update, so we continue the loop

        if not can_update:
            break  # Exit the loop if no further updates can be made

    # Convert final thresholds from dictionary to list
    thresholds = [thresholds_dict[feature] for feature in features]

    # Evaluate the final thresholds on the test set
    test_recall = recall(thresholds, rule_str, test_set)
    test_conversion_rate = conversion_rate(thresholds, rule_str, test_set)

    print(f"Final thresholds: {thresholds_dict}")
    print(f"Train Recall: {recall(thresholds, rule_str, train_set)}")
    print(f"Test Recall: {test_recall}, Test Conversion Rate: {test_conversion_rate}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Define the min_recall values to test
min_recall_values = np.linspace(0.85, 0.99, num=15)

# Initialize lists to store recall and conversion rate
recall_list = []
conversion_rate_list = []

# Iterate over each min_recall value
for min_recall in min_recall_values:
    # Initialize percentiles and thresholds
    percentiles = {feature: 1 for feature in features}
    thresholds_dict = calculate_percentile_thresholds(train_set, features, percentiles)
    
    # Main loop for adjusting thresholds
    for rule_str in rules[:1]:  # Assuming you are using only the first rule
        while True:
            can_update = False
            for feature in features:
                # Convert thresholds from dictionary to list before evaluating recall
                thresholds = [thresholds_dict[feat] for feat in features]

                current_recall = recall(thresholds, rule_str, train_set)
                if current_recall < min_recall:
                    break  # Stop if recall is already below the limit

                # Try incrementing the percentile for the current feature
                if percentiles[feature] < 100:  # Ensure we're within bounds
                    new_percentiles = percentiles.copy()
                    new_percentiles[feature] += 1
                    new_thresholds_dict = calculate_percentile_thresholds(train_set, features, new_percentiles)

                    # Convert new_thresholds from dictionary to list
                    new_thresholds = [new_thresholds_dict[feat] for feat in features]

                    new_recall = recall(new_thresholds, rule_str, train_set)

                    if new_recall >= min_recall:
                        percentiles = new_percentiles
                        thresholds_dict = new_thresholds_dict
                        can_update = True  # We made an update, so we continue the loop

            if not can_update:
                break  # Exit the loop if no further updates can be made

        # Convert final thresholds from dictionary to list
        thresholds = [thresholds_dict[feature] for feature in features]

        # Evaluate the final thresholds on the test set
        test_recall = recall(thresholds, rule_str, test_set)
        test_conversion_rate = conversion_rate(thresholds, rule_str, test_set)
        
        # Store the recall and conversion rate
        recall_list.append(min_recall)
        conversion_rate_list.append(test_conversion_rate)

# Plot Recall vs. Conversion Rate
plt.figure(figsize=(10, 6))
plt.plot(recall_list, conversion_rate_list, marker='o')
plt.title('Recall vs. Conversion Rate')
plt.xlabel('Recall')
plt.ylabel('Conversion Rate')
plt.grid(True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Define the min_recall values to test
min_recall_values = np.linspace(0.85, 0.99, num=15)

# Initialize lists to store recall and conversion rate
recall_list = []
conversion_rate_list = []

# Iterate over each min_recall value
for min_recall in min_recall_values:
    print(f"Starting optimization for min_recall: {min_recall:.2f}")
    
    # Initialize percentiles and thresholds
    percentiles = {feature: 1 for feature in features}
    thresholds_dict = calculate_percentile_thresholds(train_set, features, percentiles)
    
    # Main loop for adjusting thresholds
    for rule_str in rules[:1]:  # Assuming you are using only the first rule
        while True:
            can_update = False
            for feature in features:
                # Convert thresholds from dictionary to list before evaluating recall
                thresholds = [thresholds_dict[feat] for feat in features]

                current_recall = recall(thresholds, rule_str, train_set)
                if current_recall < min_recall:

                    break  # Stop if recall is already below the limit

                # Try incrementing the percentile for the current feature
                if percentiles[feature] < 100:  # Ensure we're within bounds
                    new_percentiles = percentiles.copy()
                    new_percentiles[feature] += 1
                    new_thresholds_dict = calculate_percentile_thresholds(train_set, features, new_percentiles)

                    # Convert new_thresholds from dictionary to list
                    new_thresholds = [new_thresholds_dict[feat] for feat in features]

                    new_recall = recall(new_thresholds, rule_str, train_set)

                    if new_recall >= min_recall:
                        percentiles = new_percentiles
                        thresholds_dict = new_thresholds_dict
                        can_update = True  # We made an update, so we continue the loop


            if not can_update:

                break  # Exit the loop if no further updates can be made

        # Convert final thresholds from dictionary to list
        thresholds = [thresholds_dict[feature] for feature in features]

        # Evaluate the final thresholds on the test set
        test_recall = recall(thresholds, rule_str, test_set)
        test_conversion_rate = conversion_rate(thresholds, rule_str, test_set)
        
        # Store the recall and conversion rate
        recall_list.append(min_recall)
        conversion_rate_list.append(test_conversion_rate)
        
        # Print final results for this min_recall
        print(f"Test results for min_recall {min_recall:.2f}: Test Recall = {test_recall:.4f}, Test Conversion Rate = {test_conversion_rate:.4f}")

# Plot Recall vs. Conversion Rate
plt.figure(figsize=(10, 6))
plt.plot(recall_list, conversion_rate_list, marker='o')
plt.title('Recall vs. Conversion Rate')
plt.xlabel('Recall')
plt.ylabel('Conversion Rate')
plt.grid(True)
plt.show()


# 3 - Tuning via algorithme génétique d'optimisation bi-objectif

In [9]:
# Define the algorithm
algorithm = NSGA2(pop_size=100,
                  sampling= FloatRandomSampling(),
                  crossover= SBX(prob_var=0.9, eta=20, prob_exch=1.0),
                  mutation= PM(eta=20),
                  eliminate_duplicates= True)

# Define the stopping criterion
stop_criteria = ('n_gen', 100)

for rule_str in rules[:1]:
    
    ##############################################################################
    ####################### Phase d'entrainement #################################
    ##############################################################################
    
    # On créé les bornes inférieures du problème
    xl = np.array([train_set[col].min() for col in features])

    # On crée les bornes supérieures du problème.
    xu = np.array([train_set[col].max() for col in features])

    # Define the problem using FunctionalProblem
    problem_train = FunctionalProblem(
        # Nombre de features
        n_var=len(features),  
        # Définition des objectifs
        # On ajoute un moins devant chaque fonction car pymoo ne peut que minimiser, et non maximiser.
        objs=[lambda x: -conversion_rate(x, rule_str, train_set), 
              lambda x: -recall(x, rule_str, train_set)],
        constr_ieq=[lambda x: -(recall(x, rule_str, train_set) - min_recall)], 
        xl=xl,
        xu=xu
    )

    res_train = minimize(problem_train, algorithm, stop_criteria, seed=1, verbose=True)

    # Extract Pareto optimal solutions
    pareto_solutions_train = res_train.X
    pareto_objectives_train = res_train.F
    
    # Evaluate each Pareto solution on the test set
    pareto_objectives_test = []
    for sol in pareto_solutions_train:
        test_conversion_rate = conversion_rate(sol, rule_str, test_set)
        test_recall = recall(sol, rule_str, test_set)
        pareto_objectives_test.append([-test_conversion_rate, -test_recall])  # Note the negative for minimization
    
    pareto_objectives_test = np.array(pareto_objectives_test)
    
    # Compute the Pareto front on the test set manually
    pareto_front_test = compute_pareto_front(pareto_objectives_test)

    # Plotting the Pareto fronts
    plt.figure(figsize=(10, 6))
    
    # Training Pareto front
    plt.scatter(-pareto_objectives_train[:, 0], -pareto_objectives_train[:, 1], color='blue', label='Train Pareto Front')
    
    # Test Pareto front
    if len(pareto_front_test) > 0:
        plt.scatter(-pareto_front_test[:, 0], -pareto_front_test[:, 1], color='red', label='Test Pareto Front')
    
    plt.title(f'Pareto Fronts for Rule: {rule_str}')
    plt.xlabel('Conversion Rate')
    plt.ylabel('Recall')
    plt.legend()
    plt.grid(True)
    plt.show()


n_gen  |  n_eval  | n_nds  |     cv_min    |     cv_avg    |      eps      |   indicator  
     1 |      100 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
     2 |      200 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
     3 |      300 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
     4 |      400 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
     5 |      500 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
     6 |      600 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
     7 |      700 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
     8 |      800 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
     9 |      900 |      1 |  0.8500000000 |  0.8500000000 |             - |             -
    10 |     1000 |      1 |  0.8500000000 |  0.8500000000 |             - |             -

KeyboardInterrupt: 

## Define the stop criterion

## Run the algorithm