# Prediction for a real use case

In [14]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# Restore the trained xgb model from its pickle file.
# NOTE: pickle.load can execute arbitrary code, so only load model files from a trusted source.
with open('../results/models/xgb_model.pkl', 'rb') as model_file:
    xgb_model = pickle.load(model_file)

# Read the processed test features
X_test = pd.read_csv('../data/processed/X_test_xgb.csv')

# Read the PSP fee table
df_fees = pd.read_excel('../data/raw/PSP_Fees.xlsx')

In [15]:
def replicate_dataframe_with_values(df, col, new_value):
    """
    Stack one copy of df per entry of new_value, with column col set to that entry.

    In every copy the column col is created if it does not exist yet and
    overwritten otherwise. The copies are concatenated in the order of
    new_value and the result gets a fresh 0..n-1 index. df itself is not
    modified.

    Parameters:
    - df: Original DataFrame
    - col: Name of the column to set in each copy
    - new_value: Iterable of values; one copy of df is produced per value

    Returns:
    - concatenated_df: A DataFrame containing all the replicated DataFrames.
      If new_value is empty, an empty DataFrame with the columns of df plus
      col is returned.
    """
    # Materialise once so generators work and emptiness can be checked
    values = list(new_value)

    if not values:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # that still carries the expected columns instead of crashing.
        empty_df = df.iloc[0:0].copy()
        if col not in empty_df.columns:
            empty_df[col] = pd.Series(dtype=object)
        return empty_df.reset_index(drop=True)

    dfs = []  # One replicated DataFrame per value
    for value in values:
        new_df = df.copy()  # Work on a copy so the caller's frame stays untouched
        new_df[col] = value  # Broadcast the value over the whole column
        dfs.append(new_df)

    # Concatenate all the replicated DataFrames into one
    concatenated_df = pd.concat(dfs, ignore_index=True)

    return concatenated_df

In [16]:
# Expand data for all PSP
label_encoder = LabelEncoder()

# Keep the original row position as an ID so the PSP variants of one
# transaction can be identified after the frame has been replicated
X_test['ID'] = X_test.index
df_test_psp = replicate_dataframe_with_values(X_test, 'PSP', new_value=['UK_Card', 'Simplecard', 'Moneycard', 'Goldcard'])
# NOTE(review): only 'PSP' is overwritten above; other columns coming from
# X_test (e.g. 'country_PSP', 'PSP_3D_secured') keep their original values.
# Confirm whether they must be recomputed for the substituted PSP.

# Merge the model data with the fees data (fee charged for a successful payment).
# validate='many_to_one' makes the merge fail loudly if the fee table holds more
# than one success row per PSP, instead of silently duplicating prediction rows.
df_test_psp = pd.merge(df_test_psp, df_fees.loc[df_fees['success'] == 1, ['PSP','fee']],
                       how='left', on='PSP', validate='many_to_one')
df_test_psp['PSP_txt'] = df_test_psp['PSP']  # keep the readable PSP name next to the encoded one

# Prepare data for model input
# NOTE(review): the encoder is fitted here on the prediction data; confirm that
# the resulting codes match the encoding used when the model was trained.
df_test_psp['PSP'] = label_encoder.fit_transform(df_test_psp['PSP'].astype(str))

# Build the feature matrix once so both model calls see exactly the same columns
model_input = df_test_psp.drop(columns=['ID', 'fee', 'PSP_txt'])

# Get the probabilities of success and the predicted class
df_test_psp['prob_xgb'] = xgb_model.predict_proba(model_input)[:, 1]
df_test_psp['decision_xgb'] = xgb_model.predict(model_input)

In [17]:
# Score every (transaction, PSP) row as a weighted mix of the predicted
# failure probability (1 - prob_xgb) and the fee; fee_factor is the weight
# given to the fee part.
fee_factor = 0.1
df_test_psp['prob_fee_rario'] = (
    ((1 - fee_factor) * (1 - df_test_psp['prob_xgb']))
    + (fee_factor * df_test_psp['fee'] / 10)  # NOTE(review): the /10 presumably rescales the fee; confirm
)

# Idea: adjust prob_xgb by a fee-based factor -> the factor can be set by the business department
# Balance between successful payments and fees

# Idea for controlling:
# Average fees and success per day or week
# Then compare with the new model
# Graphical analysis + significance etc.

# Idea: the business decision can also be based on the size of the transactions
# E.g.: amount > 1000 -> PSP with a higher fee

# Show all PSP variants of one example transaction
df_test_psp.loc[df_test_psp['ID'] == 26933]

Unnamed: 0,country,amount,PSP,3D_secured,card,previous_failure,attempts,weekday,hour,time_of_day,country_PSP,PSP_3D_secured,amount_group,ID,fee,PSP_txt,prob_xgb,decision_xgb,prob_fee_rario
