In [None]:
# coding=utf-8
#
# The copyright of this file belongs to Feedzai. The file cannot be
# reproduced in whole or in part, stored in a retrieval system,
# transmitted in any form, or by any means electronic, mechanical,
# photocopying, or otherwise, without the prior permission of the owner.
#
# (c) 2022 Feedzai, Strictly Confidential

In [None]:
import numpy as np   # Seed generation
import pandas as pd  # Matrix operations

In [None]:
# Locations of the two parquet inputs (placeholders to be replaced by the user):
#   - the 2.5M sample, used as the pool every variant is sampled from;
#   - the original dataset (with the same preprocessed features).
large_sample_path = "<2.5M_sample_path>"
original_sample_path = "<original_sample_path>"

# Load both datasets into memory.
large_sample_df = pd.read_parquet(path=large_sample_path)
original_sample_df = pd.read_parquet(path=original_sample_path)

In [None]:
# Per-month statistics of the original data. Both are stored as plain dicts
# (month -> value) to keep the lookups in the following cells simple.

# Fraction of all instances that falls in each month.
month_frequency = (
    original_sample_df["month"]
    .value_counts(normalize=True)
    .to_dict()
)

# Fraud prevalence (mean of the `fraud_bool` label) within each month.
month_fraud_prev = (
    original_sample_df
    .groupby("month")["fraud_bool"]
    .mean()
    .to_dict()
)

In [None]:
# Calculating the expected number of positive and negative instances,
# per month, given the observed month frequency and prevalence.

# Target number of rows of every generated dataset. Kept as an integer so
# that everything derived from it can be used directly as a row count.
sample_size = 1_000_000

expected_positives = {}
expected_negatives = {}

for month, prevalence in month_fraud_prev.items():
    month_size = sample_size * month_frequency[month]
    # `round(x, 0)` returns a float, but these values are later passed as the
    # number of rows to `DataFrame.sample`, so they are stored as integers.
    expected_positives[month] = int(round(month_size * prevalence))
    expected_negatives[month] = int(round(month_size * (1 - prevalence)))

In [None]:
# Sampling the "Base" dataset: Same month frequency and fraud rate per month.
base_dfs = []

SEED = 42

# Seed NumPy's global generator so that the per-month seeds drawn below (and
# every later `np.random` draw in this notebook) are reproducible when the
# notebook is executed top to bottom on a fresh kernel.
np.random.seed(SEED)

months = large_sample_df["month"].unique()
num_months = len(months)
seed_possible_values = list(range(1_000_000))
# One distinct seed per month.
seed_list = np.random.choice(seed_possible_values, size=num_months, replace=False)

for month, seed in zip(months, seed_list):
    in_month = large_sample_df["month"] == month
    positive_pool = large_sample_df[in_month & (large_sample_df["fraud_bool"] == 1)]
    negative_pool = large_sample_df[in_month & (large_sample_df["fraud_bool"] == 0)]

    # The expected counts are whole numbers but may be stored as floats;
    # `sample` needs an integer number of rows.
    positive_sample = positive_pool.sample(int(expected_positives[month]), random_state=seed)
    # Offset the seed so positives and negatives do not share a random state.
    negative_sample = negative_pool.sample(int(expected_negatives[month]), random_state=seed + SEED)

    # Collect into the list that is concatenated into `base_df` afterwards.
    base_dfs.extend([positive_sample, negative_sample])

In [None]:
# Stack the samples collected in `base_dfs` into a single dataframe:
# this is the final "Base" dataset.
base_df = pd.concat(objs=base_dfs)

In [None]:
# From here on we generate the biased samples.
# First step: define the protected groups. Customers with `customer_age`
# above 50 are labelled "Minority", all the others "Majority".
large_sample_df["group"] = (
    large_sample_df["customer_age"]
    .gt(50)
    .map({True: "Minority", False: "Majority"})
)

In [None]:
# Helper method to define the joint probability of each combination of
# group and label.

def calculate_probabilities(
    original_prevalence: float,
    prev_ratio: float,
    maj_pct: float,
) -> tuple:
    """Compute the joint probability of every (group, label) combination.

    Args:
        original_prevalence: Overall probability of the positive label,
            which the returned distribution preserves.
        prev_ratio: Prevalence of the majority group divided by the
            prevalence of the minority group.
        maj_pct: Probability of belonging to the majority group.

    Returns:
        The tuple ``(p_min_and_pos, p_maj_and_pos, p_min_and_neg,
        p_maj_and_neg)``, i.e. the joint probabilities of minority/majority
        membership with the positive/negative label.
    """
    # Probability notation (p_maj = P(A=maj))
    p_maj = maj_pct
    p_min = 1 - p_maj

    # The overall prevalence is the group-weighted mean of the group
    # prevalences:  original_prevalence = prev_maj * p_maj + prev_min * p_min
    # with prev_maj = prev_ratio * prev_min. Solving for prev_min:
    prev_min = original_prevalence / (prev_ratio * p_maj + p_min)
    prev_maj = prev_ratio * prev_min

    # Joint probabilities of the majority group.
    p_maj_and_pos = prev_maj * p_maj
    p_maj_and_neg = p_maj - p_maj_and_pos

    # Joint probabilities of the minority group.
    p_min_and_pos = prev_min * p_min
    p_min_and_neg = p_min - p_min_and_pos

    # Only the joint probabilities are returned. The conditional ones
    # (P(group | label)) are intentionally not computed: they were unused and
    # their division by the prevalence (or its complement) made the function
    # fail for a prevalence of exactly 0 or 1.
    return p_min_and_pos, p_maj_and_pos, p_min_and_neg, p_maj_and_neg

In [None]:
# Helper method to obtain a dataframe from given group, month and label.
def get_filtered_df(large_sample_df, group, month, label):
    """Return the rows of `large_sample_df` matching a group, month and label.

    Args:
        large_sample_df: Dataframe with `month`, `group` and `fraud_bool` columns.
        group: Value to match in the `group` column.
        month: Value to match in the `month` column.
        label: Value to match in the `fraud_bool` column.

    Returns:
        The subset of `large_sample_df` for which all three columns match.
    """
    # Every condition is built from the dataframe passed in, so the boolean
    # mask is always aligned with the rows being filtered.
    selected = (
        (large_sample_df["month"] == month)
        & (large_sample_df["group"] == group)
        & (large_sample_df["fraud_bool"] == label)
    )
    return large_sample_df[selected]


# Method to generate a biased sample controlling group size or prevalence (fraud rate)
def group_prevalence_disparity(large_sample_df, original_sample_df, majority_size, fraud_rate_disparity):
    """Sample a dataset with a controlled group size and fraud-rate disparity.

    For every month, the overall fraud prevalence and the share of instances
    are taken from `original_sample_df`; rows are then drawn from
    `large_sample_df` so that each (group, label) combination has the size
    given by `calculate_probabilities`.

    Relies on the notebook-level names `seed_possible_values`, `sample_size`,
    `SEED`, `get_filtered_df` and `calculate_probabilities`.

    Args:
        large_sample_df: Pool to sample from; needs the `month`, `group` and
            `fraud_bool` columns.
        original_sample_df: Reference data; needs `month` and `fraud_bool`.
        majority_size: Relative size of the majority group. Either a single
            number (used for every month) or one value per month, in
            ascending month order.
        fraud_rate_disparity: Fraud prevalence in minority / fraud prevalence
            in majority. Either a single number or one value per month, in
            ascending month order.

    Returns:
        A dataframe with the concatenated samples of all months.

    Raises:
        ValueError: If a per-month sequence does not have one value per month.
    """
    # Months are visited in ascending order so that positional per-month
    # parameters (e.g. "first 6 months, last 2 months") map to the intended
    # months regardless of the row order of the dataframe.
    months = np.sort(large_sample_df["month"].unique())
    n_months = len(months)

    # Allow for different majority sizes/fraud rates depending on the month of data.
    # This replicates a value if only one is passed.
    if isinstance(majority_size, (int, float)):
        majority_size = [majority_size] * n_months
    if isinstance(fraud_rate_disparity, (int, float)):
        fraud_rate_disparity = [fraud_rate_disparity] * n_months

    # `zip` would silently drop months if a sequence were too short.
    if len(majority_size) != n_months or len(fraud_rate_disparity) != n_months:
        raise ValueError(
            "majority_size and fraud_rate_disparity must be single numbers or "
            f"contain one value per month ({n_months} months found)."
        )

    # One distinct seed per month.
    seed_list = np.random.choice(seed_possible_values, size=n_months, replace=False)

    # Share of instances per month in the original data (loop invariant).
    month_share = original_sample_df["month"].value_counts(normalize=True)

    bias_dfs = []

    for month, seed, maj_size, fr_disp in zip(months, seed_list, majority_size, fraud_rate_disparity):
        month_prevalence = original_sample_df[original_sample_df["month"] == month]["fraud_bool"].mean()
        # `calculate_probabilities` expects majority/minority prevalence,
        # which is the inverse of the disparity received here.
        (
            p_min_and_pos,
            p_maj_and_pos,
            p_min_and_neg,
            p_maj_and_neg,
        ) = calculate_probabilities(month_prevalence, 1 / fr_disp, maj_size)

        month_size = month_share[month] * sample_size

        # Calculate the needed amount of each combination of group/label to satisfy the disparities in month.
        n_minority_positive = int(round(month_size * p_min_and_pos))
        n_minority_negative = int(round(month_size * p_min_and_neg))
        n_majority_positive = int(round(month_size * p_maj_and_pos))
        n_majority_negative = int(round(month_size * p_maj_and_neg))

        # Sample the large sample with expected values. Each combination gets
        # its own random state, derived from the seed of the month.
        bias_dfs.extend(
            [
                get_filtered_df(large_sample_df, "Minority", month, 1).sample(n_minority_positive, random_state=seed),
                get_filtered_df(large_sample_df, "Minority", month, 0).sample(n_minority_negative, random_state=seed + SEED),
                get_filtered_df(large_sample_df, "Majority", month, 1).sample(n_majority_positive, random_state=seed + 2 * SEED),
                get_filtered_df(large_sample_df, "Majority", month, 0).sample(n_majority_negative, random_state=seed + 3 * SEED),
            ]
        )

    return pd.concat(bias_dfs)

In [None]:
# Params for the generated sample (Type I: group size disparity).
# Majority will have 90% of instances, Minority 10% of instances.
#   majority_size        -> relative size of the majority group
#   fraud_rate_disparity -> fraud prevalence in minority / fraud prevalence in majority
majority_size, fraud_rate_disparity = 0.9, 1

In [None]:
# Generate the Type I dataset with the parameters defined in the previous cell.
typeI_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)

In [None]:
# Params for the generated sample (Type II: prevalence disparity).
# Minority will have 5 times more fraud, when compared to Majority.
#   majority_size        -> relative size of the majority group
#   fraud_rate_disparity -> fraud prevalence in minority / fraud prevalence in majority
majority_size, fraud_rate_disparity = 0.5, 5

In [None]:
# Generate the Type II dataset with the parameters defined in the previous cell.
typeII_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)

In [None]:
# Helper class that wraps the logic of the multivariate normal distribution mean calculation.
from mvn import TypeIIIBiasSampler

In [None]:
# Params for the generated sample (Type III).
# For the TypeIII, we want only to change the separability of the groups.
#   majority_size        -> relative size of the majority group
#   fraud_rate_disparity -> fraud prevalence in minority / fraud prevalence in majority
majority_size, fraud_rate_disparity = 0.5, 1

typeIII_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)

In [None]:
# To do so, we use the wrapper for the MVN distributions.

# Positional arguments: the label column ("fraud_bool"), the group column
# ("group"), then 0.9 and 0.05. The third position is the one that
# `separability_disparities` fills with its per-month `recall` value; the
# meaning of 0.05 is not visible in this notebook.
# NOTE(review): confirm both against the `mvn.TypeIIIBiasSampler` signature.
bias_sampler = TypeIIIBiasSampler("fraud_bool", "group", 0.9, 0.05, protected_attribute_values=["Majority", "Minority"])
bias_sampler(typeIII_df)  # This operation is inplace and injects the bias to new columns.

In [None]:
# For TypeIV, we control the prevalence to be higher in train months:
# the first 6 months (train) get a fraud rate disparity of 5, the last
# 2 months (test) a disparity of 1.
majority_size = 0.9
fraud_rate_disparity = [5] * 6 + [1] * 2

typeIV_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)

In [None]:
# Helper method to obtain last dataset, with differences in separability by month.
def separability_disparities(large_sample_df, recalls, seed=42):
    """Generate a dataset whose group separability changes from month to month.

    A sample with balanced groups (majority size 0.5) and equal fraud rates
    (disparity 1) is drawn first; a separate `TypeIIIBiasSampler` is then
    applied to the rows of every month.

    Relies on the notebook-level names `original_sample_df`,
    `group_prevalence_disparity` and `TypeIIIBiasSampler`.

    Args:
        large_sample_df: Pool to sample from.
        recalls: One value per month, in ascending month order, passed as the
            third positional argument of `TypeIIIBiasSampler`.
        seed: Currently unused; each sampler is seeded with its month.
            NOTE(review): decide whether this should be combined with the month.

    Returns:
        A dataframe with the concatenated, bias-injected rows of all months.

    Raises:
        ValueError: If `recalls` does not contain one value per month.
    """
    # `group_prevalence_disparity` also needs the original data to derive the
    # month sizes and prevalences.
    df = group_prevalence_disparity(large_sample_df, original_sample_df, 0.5, 1)

    months = sorted(df["month"].unique())
    # `zip` would silently drop months if `recalls` were too short.
    if len(recalls) != len(months):
        raise ValueError(
            f"recalls must contain one value per month ({len(months)} months found)."
        )

    final_bias_dfs = []
    for month, recall in zip(months, recalls):
        # For each month we will create a different bias sampler, with the defined separability.
        bias_sampler = TypeIIIBiasSampler(
            "fraud_bool",
            "group",
            recall,
            0.05,
            protected_attribute_values=["Majority", "Minority"],
            seed=month,
        )
        # Work on an independent copy: the sampler is described elsewhere in
        # this notebook as operating in place, which would be lost (or warn)
        # on a filtered slice of `df`.
        month_df = df[df["month"] == month].copy()
        sampled = bias_sampler(month_df)
        # Support both an in-place sampler (returns None) and one that
        # returns the modified dataframe.
        final_bias_dfs.append(month_df if sampled is None else sampled)
    return pd.concat(final_bias_dfs)

In [None]:
# The separabilities are defined to be high in train data and negligible in
# test data: 0.9 for the first 6 months, 0.05 for the last 2 months.
separability = [0.9] * 6 + [0.05] * 2

typeV_df = separability_disparities(large_sample_df=large_sample_df, recalls=separability)