# Prediction for real usecase

In [53]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# Load xgb model
with open('../results/models/xgb_model.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

# Load test data
X_test = pd.read_csv('../data/processed/X_test_xgb.csv')

# Load fee data
df_fees = pd.read_excel('../data/raw/PSP_Fees.xlsx')

In [5]:

def replicate_dataframe_with_values(df, col, new_values):
    """
    Replicates the DataFrame df for each value in new_values and sets the 'col' column
    with the corresponding new value, while also preserving the original value in a new column.
    
    Parameters:
    - df: Original DataFrame
    - col: Name of the column to update
    - new_values: List or array of values to set in the specified column
    
    Returns:
    - concatenated_df: A DataFrame containing all the replicated DataFrames with the original 
      and new values of the specified column.
    """
    dfs = []  # List to store the replicated DataFrames

    # Create a new column to store the original values
    original_col_name = f"original_{col}"
    df[original_col_name] = df[col]

    for value in new_values:
        new_df = df.copy()  # Create a copy of the original DataFrame
        new_df[col] = value  # Set the column with the new value
        dfs.append(new_df)  # Add the new DataFrame to the list

    # Concatenate all the replicated DataFrames into one
    concatenated_df = pd.concat(dfs, ignore_index=True)

    return concatenated_df

In [15]:
# Expand data for all PSP
label_encoder = LabelEncoder()

X_test['ID'] = X_test.index
df_test_psp = replicate_dataframe_with_values(X_test, 'PSP', new_values=['UK_Card', 'Simplecard', 'Moneycard', 'Goldcard'])

# Merge the model data with the fees data
df_test_psp = pd.merge(df_test_psp, df_fees.loc[df_fees['success'] == 1, ['PSP','fee']],
                       how='left', on='PSP')
df_test_psp['PSP_txt'] = df_test_psp['PSP']

# Prepare data for modelinput
df_test_psp['PSP'] = label_encoder.fit_transform(df_test_psp['PSP'].astype(str))

# Get the probabilities of success
df_test_psp['prob_xgb'] = xgb_model.predict_proba(df_test_psp.drop(columns=['ID', 'fee', 'PSP_txt', 'original_PSP']))[:, 1]
df_test_psp['decision_xgb'] = xgb_model.predict(df_test_psp.drop(columns=['ID', 'fee', 'PSP_txt', 'original_PSP', 'prob_xgb']))

In [39]:
# Explaination prop_fee_ratio:
# The lowest value will be the decision
# The higher the factor, the more important the level of fees becomes
fee_factor = 0.3

# Calculate expected value per ID
df_test_psp['prob_fee_ratio'] = (((1-fee_factor) * (1-df_test_psp['prob_xgb'])) + 
                                 (fee_factor * df_test_psp['fee'] / 10))

# Example for ID 10
df_test_psp[df_test_psp['ID'] == 10]

Unnamed: 0,country,amount,PSP,3D_secured,card,previous_failure,attempts,weekday,hour,time_of_day,...,prob_fee_ratio_original,fee_original,diff_current,diff_best_prop_fee,prob_fee_ratio_original.1,fee_original.1,prob_fee_ratio_original.2,fee_original.2,prob_fee_ratio_original.3,fee_original.3
10,1,259,3,0,0,1,7,1,12,0,...,0.811426,3.0,0.0,0.097809,0.839839,3.0,0.754601,3.0,0.697776,3.0
12613,1,259,2,0,0,1,7,1,12,0,...,0.811426,3.0,0.078502,0.20331,0.839839,3.0,0.754601,3.0,0.697776,3.0
25216,1,259,1,0,0,1,7,1,12,0,...,0.811426,3.0,0.032498,0.078641,0.839839,3.0,0.754601,3.0,0.697776,3.0
37819,1,259,0,0,0,1,7,1,12,0,...,0.811426,3.0,0.074246,0.0,0.839839,3.0,0.754601,3.0,0.697776,3.0


In [40]:
# Calculate the sum fee of the highest prob_xgb per ID
df_sorted = df_test_psp.sort_values(by=['ID', 'prob_xgb'], ascending=[True, False])
df_max_prob_xgb = df_sorted.drop_duplicates(subset='ID')

# Sum the 'fee' values of these rows
print(df_max_prob_xgb['fee'].sum())
print(df_test_psp[df_test_psp['original_PSP'] == df_test_psp['PSP']]['fee'].sum())

print(df_max_prob_xgb['prob_xgb'].mean())
print(df_test_psp[df_test_psp['original_PSP'] == df_test_psp['PSP']]['prob_xgb'].mean())

116169.0
41532.0
0.38973
0.20229283


In [41]:
# Calculate the sum fee of the highest prob_xgb per ID
df_sorted = df_test_psp.sort_values(by=['ID', 'prob_fee_ratio'], ascending=[False,True])
df_max_prob_fee_ratio = df_sorted.drop_duplicates(subset='ID')

# Sum the 'fee' values of these rows
print(df_max_prob_fee_ratio['fee'].sum())
print(df_test_psp[df_test_psp['original_PSP'] == df_test_psp['PSP']]['fee'].sum())

print(df_max_prob_fee_ratio['prob_xgb'].mean())
print(df_test_psp[df_test_psp['original_PSP'] == df_test_psp['PSP']]['prob_xgb'].mean())

28147.0
41532.0
0.2327001
0.20229283


In [54]:
# Define a list of fee_factor values
fee_factors = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

# Initialize an empty list to store results
results = []

# Loop over each fee_factor value
for fee_factor in fee_factors:
    # Calculate prob_fee_ratio
    df_test_psp['prob_fee_ratio'] = (((1 - fee_factor) * (1 - df_test_psp['prob_xgb'])) +
                                     (fee_factor * df_test_psp['fee'] / 10))

    # Calculations for highest prob_fee_ratio per ID
    df_sorted_by_fee_ratio = df_test_psp.sort_values(by=['ID', 'prob_fee_ratio'], ascending=[False, True])
    highest_prob_fee_ratio_per_id = df_sorted_by_fee_ratio.drop_duplicates(subset='ID')
    total_fee_highest_prob_fee_ratio = highest_prob_fee_ratio_per_id['fee'].sum()
    average_prob_xgb_highest_fee_ratio = highest_prob_fee_ratio_per_id['prob_xgb'].mean()

    # Calculations for original PSP
    original_psp = df_test_psp[df_test_psp['original_PSP'] == df_test_psp['PSP']]
    total_fee_original_psp = original_psp['fee'].sum()
    average_prob_xgb_original_psp = original_psp['prob_xgb'].mean()

    # Append the results
    results.append({
        'fee_factor': fee_factor,
        'total_fee_highest_prob_fee_ratio': np.round(total_fee_highest_prob_fee_ratio, 2),
        'average_prob_xgb_highest_fee_ratio': np.round(average_prob_xgb_highest_fee_ratio, 2),
        'total_fee_original_psp': np.round(total_fee_original_psp, 2),
        'average_prob_xgb_original_psp': np.round(average_prob_xgb_original_psp, 2)
    })

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

In [55]:
results_df.head()

Unnamed: 0,fee_factor,total_fee_highest_prob_fee_ratio,average_prob_xgb_highest_fee_ratio,total_fee_original_psp,average_prob_xgb_original_psp
0,0.0,116169.0,0.39,41532.0,0.2
1,0.05,110410.0,0.39,41532.0,0.2
2,0.1,100652.0,0.38,41532.0,0.2
3,0.15,87192.0,0.37,41532.0,0.2
4,0.2,64888.0,0.33,41532.0,0.2


In [None]:
# Calculate the difference between the best PSP and the current PSP
df_test_psp['diff_best_prop_fee'] = df_test_psp.groupby('ID')['prob_fee_ratio'].transform(max) - df_test_psp['prob_fee_ratio']

# Set the value of the original PSP
original_values_df = df_test_psp[df_test_psp['original_PSP'] == df_test_psp['PSP']]
original_values_df = original_values_df[['ID', 'prob_fee_ratio', 'fee']].set_index('ID')
df_test_psp = df_test_psp.merge(original_values_df, on='ID', how='left', suffixes=('', '_original'))
df_test_psp['diff_current_prop_fee'] = df_test_psp['prob_fee_ratio_original'] - df_test_psp['prob_fee_ratio']

df_test_psp['diff_current_fee'] = df_test_psp['fee_original'] - df_test_psp['fee']

# Display rows where 'ID' is 10
df_test_psp[df_test_psp['ID'] == 10]